# coordinator/tests/test_integration.py

"""Integration tests for the debate round orchestration.

These tests hit the REAL multica API.  They require the coordinator env vars:

    COORDINATOR_SERVER_URL   e.g. https://multica.example.com
    COORDINATOR_WORKSPACE_ID
    COORDINATOR_TOKEN

Run with:
    python -m pytest tests/test_integration.py -m integration -v

The test creates a scratch issue, exercises the full orchestration lifecycle
with simulated debater/judge responses (posted via the coordinator's own token
using the content-based fallback detector), verifies the result, then cleans up.
"""
from __future__ import annotations

import logging
import os
import time
from pathlib import Path

import pytest

pytestmark = pytest.mark.integration


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture(scope="module")
def cfg():
    """Build the coordinator ``Config`` from the environment.

    The env file is loaded *before* the required variables are checked, so
    credentials that live only in the env file (as the skip message below
    recommends) are taken into account instead of triggering a skip.

    Skips the dependent tests when any required variable is still unset or
    empty after the env file has been loaded.

    Returns:
        The object produced by ``Config.from_env()``.
    """
    from coordinator.config import Config, load_env_file

    # NOTE(review): presumably load_env_file() exports the entries of
    # ~/.coordinator/env into os.environ (the original comment says it is the
    # same loader Config.from_env uses) — confirm against coordinator.config.
    load_env_file()

    required = (
        "COORDINATOR_SERVER_URL",
        "COORDINATOR_WORKSPACE_ID",
        "COORDINATOR_TOKEN",
    )
    # Empty strings count as missing, hence the truthiness test.
    missing = [key for key in required if not os.environ.get(key)]
    if missing:
        pytest.skip(
            f"Integration test requires env vars: {', '.join(missing)}\n"
            "Put them in ~/.coordinator/env or export them."
        )
    return Config.from_env()


@pytest.fixture(scope="module")
def client(cfg):
    """Return a module-scoped ``MulticaClient`` built from ``cfg``.

    The client is constructed from the server URL, workspace id and token
    carried by the ``cfg`` fixture.
    """
    from coordinator.multica_client import MulticaClient

    connection_args = (cfg.server_url, cfg.workspace_id, cfg.token)
    return MulticaClient(*connection_args)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _post_fake_debater_replies(client, issue_id: str):
    """Post one simulated evidence comment per debater on *issue_id*.

    Each comment starts with a ``[<name> response]:`` marker; the test relies
    on the orchestrator's content-based fallback detector treating these as
    genuine debater replies. Comments are posted through *client*, i.e. with
    the coordinator's own token.
    """
    from coordinator.orchestrator import DEBATER_NAMES

    for debater in DEBATER_NAMES:
        body = (
            f"[{debater} response]: Integration-test evidence for {debater}. "
            "This comment simulates the agent's evidence analysis."
        )
        client.post_comment(issue_id, body)


def _post_fake_judge_verdict(client, issue_id: str, verdict: str = "ACCEPT"):
    client.post_comment(
        issue_id,
        f"VERDICT: {verdict}\n\n"
        "Integration test verdict. The implementation satisfies the acceptance criteria.",
    )


# ---------------------------------------------------------------------------
# Integration test: full round lifecycle
# ---------------------------------------------------------------------------

@pytest.mark.integration
def test_full_debate_round_lifecycle(cfg, client, tmp_path):
    """End-to-end debate round lifecycle against the real API.

    Steps:
      1. Create scratch issue, set to in_review
      2. poll_once → verify a round for the scratch issue is enqueued
      3. _start_round → verify coordinator comment posted on real issue
      4. Post simulated debater responses (content-marker fallback)
      5. _advance_awaiting_debaters → verify judge comment posted
      6. Post simulated judge verdict
      7. _advance_awaiting_judge → verify issue status updated (RACE FIX verified)
      8. Cleanup (best effort: the scratch issue is set to ``done``, not deleted)

    The workspace is shared, so ``poll_once`` may enqueue rounds for other
    in_review issues too; every queue lookup therefore filters by the scratch
    issue's id instead of relying on queue position.
    """
    from coordinator.__main__ import poll_once
    from coordinator.orchestrator import (
        DEBATER_NAMES,
        _advance_awaiting_debaters,
        _advance_awaiting_judge,
        _start_round,
    )
    from coordinator.queue import DebateQueue
    from coordinator.state import SeenState

    logger = logging.getLogger("test.integration")
    issue_id = None

    try:
        # --- 1. Create scratch issue ---
        issue = client.create_issue(
            title="[DEBATE-SMOKE-TEST] Integration test — auto-delete",
            description=(
                "This issue is created by the coordinator integration test. "
                "It will be cleaned up automatically.\n\n"
                "Commit: https://git.wylab.me/multica/coordinator/commit/test000"
            ),
        )
        issue_id = issue["id"]
        logger.info("created scratch issue %s", issue_id)

        # Set to in_review so the watcher picks it up
        client.update_issue_status(issue_id, "in_review")

        # --- 2. poll_once → enqueue round ---
        # State and queue live under tmp_path so the run starts from empty files.
        state = SeenState.load(tmp_path / "seen.json")
        queue = DebateQueue.load(tmp_path / "queue.json")
        poll_once(client, state, queue, logger)

        pending = queue.pending()
        assert pending, "Expected round enqueued after poll_once"
        # Pick OUR round by issue id: other in_review issues in the workspace
        # may have been enqueued as well, so position 0 is not guaranteed.
        round_ = next((r for r in pending if r.issue_id == issue_id), None)
        assert round_ is not None, (
            f"poll_once enqueued no pending round for scratch issue {issue_id}"
        )
        assert round_.status == "pending"

        # --- 3. _start_round → debater comment posted ---
        _start_round(round_, client, queue, cfg, logger)

        assert round_.status == "running", f"Expected running, got {round_.status}"
        assert round_.phase == "awaiting_debaters", f"Expected awaiting_debaters, got {round_.phase}"
        assert round_.coordinator_comment_id, "coordinator_comment_id must be set"

        # Verify comment appeared on the real issue
        coord_comment = _find_comment_by_id(
            client.list_comments(issue_id), round_.coordinator_comment_id
        )
        assert coord_comment is not None, "Coordinator comment not found in issue comments"
        for name in DEBATER_NAMES:
            assert name in coord_comment["content"], f"{name} not mentioned in coordinator comment"

        # --- 4. Post simulated debater responses ---
        _post_fake_debater_replies(client, issue_id)

        # --- 5. _advance_awaiting_debaters → judge comment posted ---
        _advance_awaiting_debaters(round_, client, queue, cfg, logger)

        assert round_.phase == "awaiting_judge", f"Expected awaiting_judge, got {round_.phase}"
        assert round_.judge_comment_id, "judge_comment_id must be set"

        judge_comment = _find_comment_by_id(
            client.list_comments(issue_id), round_.judge_comment_id
        )
        assert judge_comment is not None, "Judge comment not found in issue comments"
        assert "Verdict requested" in judge_comment["content"]

        # --- 6. Post simulated judge verdict ---
        _post_fake_judge_verdict(client, issue_id, verdict="ACCEPT")

        # --- 7. _advance_awaiting_judge → issue status updated ---
        _advance_awaiting_judge(round_, client, queue, cfg, logger)

        assert round_.status == "done", f"Expected done, got {round_.status}"
        assert round_.phase == "accepted", f"Expected accepted, got {round_.phase}"

        # Verify issue status via real API (not just in-memory)
        refreshed = client.get_issue(issue_id)
        assert refreshed.get("status") == "done", (
            f"Issue status should be 'done' after ACCEPT verdict, "
            f"got {refreshed.get('status')!r}"
        )

        # RACE FIX: after _advance_awaiting_judge completes, issue is no longer in_review,
        # so poll_once cannot double-enqueue this round.
        # Note: other in_review issues in the workspace may also get enqueued — that's OK.
        poll_once(client, state, queue, logger)
        our_rounds = [r for r in queue.rounds if r.issue_id == issue_id]
        assert len(our_rounds) == 1, (
            f"poll_once created a second round for issue {issue_id} (double-enqueue)"
        )

        logger.info("VERDICT OK — full lifecycle passed")

    finally:
        # --- 8. Cleanup ---
        # Best effort: move the scratch issue out of in_review even when an
        # assertion above failed. A cleanup error is logged rather than raised
        # so it cannot mask the original test failure.
        if issue_id:
            try:
                client.update_issue_status(issue_id, "done")
                logger.info("cleanup: issue %s set to done", issue_id)
            except Exception as exc:
                logger.warning("cleanup failed for issue %s: %s", issue_id, exc)


def _find_comment_by_id(comments, comment_id):
    """Return the first comment dict in *comments* whose ``id`` is *comment_id*, else ``None``."""
    return next((c for c in comments if c.get("id") == comment_id), None)