Files
coordinator/tests/test_integration.py
T
m-senior-developer 0e44846032 Implement debate round orchestration (WYL-45)
New module: src/coordinator/orchestrator.py
- DEBATER_NAMES, JUDGE_NAME, DEBATER_PROMPTS, JUDGE_PROMPT_TEMPLATE hardcoded for v1
- Per-debater prompts tell each debater exactly which tool output to ground evidence in
- orchestrate_pending() is the main entry point called from watch_loop
- _start_round(): pending→running, posts debater mention comment, phase→awaiting_debaters
- _advance_awaiting_debaters(): polls for replies, handles timeout with partial evidence,
  posts judge comment, phase→awaiting_judge
- _advance_awaiting_judge(): polls for verdict; RACE FIX — update_issue_status() called
  BEFORE queue.update_status("done") so poll_once can never double-enqueue
- Detection: primary=author_id match, fallback=[{name} response]: content marker (enables tests)
- Restart-safe: phase field persisted on every mutation; in-flight rounds resume correctly

Extended src/coordinator/queue.py:
- Round gains phase, phase_entered_at, coordinator_comment_id, judge_comment_id fields
- DebateQueue.update_phase() and running() added
- All new fields default-empty so existing queue.json files load cleanly

Extended src/coordinator/multica_client.py:
- update_issue_status() convenience wrapper
- create_issue() for integration / smoke tests

Updated src/coordinator/__main__.py:
- _orchestrate_pending stub replaced with real import from orchestrator

Tests:
- tests/test_orchestrator.py: 32 new unit tests covering phase transitions, timeouts,
  race fix ordering, restart resume, full lifecycle
- tests/test_integration.py: @pytest.mark.integration test against real API
- smoke_test.py: standalone end-to-end script; ran against real API, verdict OK

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 21:43:17 +00:00

212 lines
8.0 KiB
Python

"""Integration tests for the debate round orchestration.

These tests hit the REAL multica API. They require the coordinator env vars:

    COORDINATOR_SERVER_URL      e.g. https://multica.example.com
    COORDINATOR_WORKSPACE_ID
    COORDINATOR_TOKEN

Run with:

    python -m pytest tests/test_integration.py -m integration -v

The test creates a scratch issue, exercises the full orchestration lifecycle
with simulated debater/judge responses (posted via the coordinator's own token
and picked up by the content-based fallback detector), verifies the result,
then cleans up by marking the scratch issue done.
"""
from __future__ import annotations
import logging
import os
import time
from pathlib import Path
import pytest
pytestmark = pytest.mark.integration
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture(scope="module")
def cfg():
    """Load Config from env. Skip module if vars missing.

    The env file is loaded *before* the required variables are checked, so
    that values supplied only through the env file (which the skip message
    itself recommends) are taken into account instead of triggering a skip.

    Returns:
        The object produced by ``Config.from_env()``.
    """
    # Project imports stay function-local, like everywhere else in this file.
    from coordinator.config import Config, load_env_file

    # Load env file if present (same as Config.from_env).
    # NOTE(review): assumes load_env_file() exports its values into
    # os.environ — confirm against coordinator.config.
    load_env_file()

    required = [
        "COORDINATOR_SERVER_URL",
        "COORDINATOR_WORKSPACE_ID",
        "COORDINATOR_TOKEN",
    ]
    # Empty values count as missing, hence the truthiness test.
    missing = [k for k in required if not os.environ.get(k)]
    if missing:
        pytest.skip(
            f"Integration test requires env vars: {', '.join(missing)}\n"
            "Put them in ~/.coordinator/env or export them."
        )

    return Config.from_env()
@pytest.fixture(scope="module")
def client(cfg):
    """Build the module-scoped ``MulticaClient`` from the loaded config.

    The client is constructed from ``cfg.server_url``, ``cfg.workspace_id``
    and ``cfg.token``, in that positional order.
    """
    # Function-local import, consistent with the other project imports here.
    from coordinator.multica_client import MulticaClient

    api_client = MulticaClient(cfg.server_url, cfg.workspace_id, cfg.token)
    return api_client
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _post_fake_debater_replies(client, issue_id: str):
    """Simulate every debater answering on ``issue_id``.

    One comment per entry of ``DEBATER_NAMES`` is posted through ``client``
    (i.e. with the coordinator token). Each comment starts with the
    ``[<name> response]:`` content marker, which the orchestrator's
    content-based fallback detector accepts as a valid debater evidence reply.
    """
    from coordinator.orchestrator import DEBATER_NAMES

    # Shared second sentence of every simulated reply.
    trailer = "This comment simulates the agent's evidence analysis."
    for debater in DEBATER_NAMES:
        body = (
            f"[{debater} response]: Integration-test evidence for {debater}. "
            + trailer
        )
        client.post_comment(issue_id, body)
def _post_fake_judge_verdict(client, issue_id: str, verdict: str = "ACCEPT"):
client.post_comment(
issue_id,
f"VERDICT: {verdict}\n\n"
"Integration test verdict. The implementation satisfies the acceptance criteria.",
)
# ---------------------------------------------------------------------------
# Integration test: full round lifecycle
# ---------------------------------------------------------------------------
@pytest.mark.integration
def test_full_debate_round_lifecycle(cfg, client, tmp_path):
    """End-to-end debate round lifecycle against the real API.

    Steps:
      1. Create scratch issue, set to in_review
      2. poll_once → verify round enqueued
      3. _start_round → verify coordinator comment posted on real issue
      4. Post simulated debater responses (content-marker fallback)
      5. _advance_awaiting_debaters → verify judge comment posted
      6. Post simulated judge verdict
      7. _advance_awaiting_judge → verify issue status updated (RACE FIX verified)
      8. Cleanup

    The queue and seen-state files live under ``tmp_path``, so every run
    starts from an empty queue. Because the workspace may contain other
    ``in_review`` issues, the round under test is always looked up by the
    scratch issue's id rather than by its position in the queue.
    """
    from coordinator.__main__ import poll_once
    from coordinator.orchestrator import (
        DEBATER_NAMES,
        _advance_awaiting_debaters,
        _advance_awaiting_judge,
        _start_round,
    )
    from coordinator.queue import DebateQueue
    from coordinator.state import SeenState

    logger = logging.getLogger("test.integration")
    # Stays None until the scratch issue exists, so cleanup knows whether
    # there is anything to clean up.
    issue_id = None

    try:
        # --- 1. Create scratch issue ---
        issue = client.create_issue(
            title="[DEBATE-SMOKE-TEST] Integration test — auto-delete",
            description=(
                "This issue is created by the coordinator integration test. "
                "It will be cleaned up automatically.\n\n"
                "Commit: https://git.wylab.me/multica/coordinator/commit/test000"
            ),
        )
        issue_id = issue["id"]
        logger.info("created scratch issue %s", issue_id)

        # Set to in_review so the watcher picks it up
        client.update_issue_status(issue_id, "in_review")

        # --- 2. poll_once → enqueue round ---
        state = SeenState.load(tmp_path / "seen.json")
        queue = DebateQueue.load(tmp_path / "queue.json")
        poll_once(client, state, queue, logger)

        assert queue.pending(), "Expected round enqueued after poll_once"
        # Other in_review issues in the workspace may have been enqueued by
        # the same poll, so pick our round by issue id instead of assuming
        # it is the first pending entry.
        round_ = next(
            (r for r in queue.pending() if r.issue_id == issue_id),
            None,
        )
        assert round_ is not None, (
            f"Expected a pending round for issue {issue_id} after poll_once"
        )
        assert round_.status == "pending"

        # --- 3. _start_round → debater comment posted ---
        _start_round(round_, client, queue, cfg, logger)

        assert round_.status == "running", f"Expected running, got {round_.status}"
        assert round_.phase == "awaiting_debaters", (
            f"Expected awaiting_debaters, got {round_.phase}"
        )
        assert round_.coordinator_comment_id, "coordinator_comment_id must be set"

        # Verify comment appeared on the real issue
        real_comments = client.list_comments(issue_id)
        coord_comment = next(
            (c for c in real_comments if c.get("id") == round_.coordinator_comment_id),
            None,
        )
        assert coord_comment is not None, "Coordinator comment not found in issue comments"
        for name in DEBATER_NAMES:
            assert name in coord_comment["content"], (
                f"{name} not mentioned in coordinator comment"
            )

        # --- 4. Post simulated debater responses ---
        _post_fake_debater_replies(client, issue_id)

        # --- 5. _advance_awaiting_debaters → judge comment posted ---
        _advance_awaiting_debaters(round_, client, queue, cfg, logger)

        assert round_.phase == "awaiting_judge", (
            f"Expected awaiting_judge, got {round_.phase}"
        )
        assert round_.judge_comment_id, "judge_comment_id must be set"

        real_comments = client.list_comments(issue_id)
        judge_comment = next(
            (c for c in real_comments if c.get("id") == round_.judge_comment_id),
            None,
        )
        assert judge_comment is not None, "Judge comment not found in issue comments"
        assert "Verdict requested" in judge_comment["content"]

        # --- 6. Post simulated judge verdict ---
        _post_fake_judge_verdict(client, issue_id, verdict="ACCEPT")

        # --- 7. _advance_awaiting_judge → issue status updated ---
        _advance_awaiting_judge(round_, client, queue, cfg, logger)

        assert round_.status == "done", f"Expected done, got {round_.status}"
        assert round_.phase == "accepted", f"Expected accepted, got {round_.phase}"

        # Verify issue status via real API (not just in-memory)
        refreshed = client.get_issue(issue_id)
        assert refreshed.get("status") == "done", (
            f"Issue status should be 'done' after ACCEPT verdict, "
            f"got {refreshed.get('status')!r}"
        )

        # RACE FIX: after _advance_awaiting_judge completes, issue is no longer in_review,
        # so poll_once cannot double-enqueue this round.
        # Note: other in_review issues in the workspace may also get enqueued — that's OK.
        poll_once(client, state, queue, logger)
        our_rounds = [r for r in queue.rounds if r.issue_id == issue_id]
        assert len(our_rounds) == 1, (
            f"poll_once created a second round for issue {issue_id} (double-enqueue)"
        )

        logger.info("VERDICT OK — full lifecycle passed")
    finally:
        # --- 8. Cleanup ---
        # Best-effort: a failed cleanup is logged but must not mask the
        # outcome of the test itself.
        if issue_id:
            try:
                client.update_issue_status(issue_id, "done")
                logger.info("cleanup: issue %s set to done", issue_id)
            except Exception as exc:
                logger.warning("cleanup failed for issue %s: %s", issue_id, exc)