0e44846032
New module: src/coordinator/orchestrator.py
- DEBATER_NAMES, JUDGE_NAME, DEBATER_PROMPTS, JUDGE_PROMPT_TEMPLATE hardcoded for v1
- Per-debater prompts tell each debater exactly which tool output to ground evidence in
- orchestrate_pending() is the main entry point called from watch_loop
- _start_round(): pending→running, posts debater mention comment, phase→awaiting_debaters
- _advance_awaiting_debaters(): polls for replies, handles timeout with partial evidence,
posts judge comment, phase→awaiting_judge
- _advance_awaiting_judge(): polls for verdict; RACE FIX — update_issue_status() called
BEFORE queue.update_status("done") so poll_once can never double-enqueue
- Detection: primary=author_id match, fallback=[{name} response]: content marker (enables tests)
- Restart-safe: phase field persisted on every mutation; in-flight rounds resume correctly
Extended src/coordinator/queue.py:
- Round gains phase, phase_entered_at, coordinator_comment_id, judge_comment_id fields
- DebateQueue.update_phase() and running() added
- All new fields default-empty so existing queue.json files load cleanly
Extended src/coordinator/multica_client.py:
- update_issue_status() convenience wrapper
- create_issue() for integration / smoke tests
Updated src/coordinator/__main__.py:
- _orchestrate_pending stub replaced with real import from orchestrator
Tests:
- tests/test_orchestrator.py: 32 new unit tests covering phase transitions, timeouts,
race fix ordering, restart resume, full lifecycle
- tests/test_integration.py: @pytest.mark.integration test against real API
- smoke_test.py: standalone end-to-end script; ran against real API, verdict OK
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
212 lines
8.0 KiB
Python
212 lines
8.0 KiB
Python
"""Integration tests for the debate round orchestration.
|
|
|
|
These tests hit the REAL multica API. They require the coordinator env vars:
|
|
|
|
COORDINATOR_SERVER_URL e.g. https://multica.example.com
|
|
COORDINATOR_WORKSPACE_ID
|
|
COORDINATOR_TOKEN
|
|
|
|
Run with:
|
|
python -m pytest tests/test_integration.py -m integration -v
|
|
|
|
The test creates a scratch issue, exercises the full orchestration lifecycle
|
|
with simulated debater/judge responses (posted via the coordinator's own token
|
|
using the content-based fallback detector), verifies the result, then cleans up.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
pytestmark = pytest.mark.integration
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.fixture(scope="module")
def cfg():
    """Load the coordinator Config from the environment, or skip the module.

    The env file is loaded *before* the required-variable check. The skip
    message tells users they may put the variables in ``~/.coordinator/env``,
    so checking ``os.environ`` first would skip the module even when the
    variables are correctly provided through that file.

    Returns:
        The object produced by ``Config.from_env()``.
    """
    from coordinator.config import Config, load_env_file

    # NOTE(review): assumes load_env_file() exports the file's entries into
    # os.environ (and is a no-op when the file is absent) — confirm.
    load_env_file()

    required = [
        "COORDINATOR_SERVER_URL",
        "COORDINATOR_WORKSPACE_ID",
        "COORDINATOR_TOKEN",
    ]
    # Empty values are treated the same as unset ones.
    missing = [k for k in required if not os.environ.get(k)]
    if missing:
        pytest.skip(
            f"Integration test requires env vars: {', '.join(missing)}\n"
            "Put them in ~/.coordinator/env or export them."
        )

    return Config.from_env()
|
|
|
|
|
|
@pytest.fixture(scope="module")
def client(cfg):
    """Module-scoped MulticaClient built from the ``cfg`` fixture.

    The client is constructed from the config's server URL, workspace id and
    token, in that positional order.
    """
    from coordinator.multica_client import MulticaClient

    server_url, workspace_id, token = cfg.server_url, cfg.workspace_id, cfg.token
    return MulticaClient(server_url, workspace_id, token)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _post_fake_debater_replies(client, issue_id: str):
    """Simulate one evidence reply per debater on ``issue_id``.

    Each comment is posted through ``client`` (i.e. with the coordinator's own
    token) and starts with the ``[<name> response]:`` content marker, which the
    orchestrator's content-based fallback detector accepts as a valid debater
    evidence reply.
    """
    from coordinator.orchestrator import DEBATER_NAMES

    for debater in DEBATER_NAMES:
        body = (
            f"[{debater} response]: Integration-test evidence for {debater}. "
            "This comment simulates the agent's evidence analysis."
        )
        client.post_comment(issue_id, body)
|
|
|
|
|
|
def _post_fake_judge_verdict(client, issue_id: str, verdict: str = "ACCEPT"):
|
|
client.post_comment(
|
|
issue_id,
|
|
f"VERDICT: {verdict}\n\n"
|
|
"Integration test verdict. The implementation satisfies the acceptance criteria.",
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Integration test: full round lifecycle
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.integration
def test_full_debate_round_lifecycle(cfg, client, tmp_path):
    """End-to-end debate round lifecycle against the real API.

    Steps:
      1. Create scratch issue, set to in_review
      2. poll_once → verify round enqueued
      3. _start_round → verify coordinator comment posted on real issue
      4. Post simulated debater responses (content-marker fallback)
      5. _advance_awaiting_debaters → verify judge comment posted
      6. Post simulated judge verdict
      7. _advance_awaiting_judge → verify issue status updated (RACE FIX verified)
      8. Cleanup

    The workspace is shared, so poll_once may enqueue rounds for unrelated
    in_review issues. Every lookup of "our" round therefore filters by the
    scratch issue's id instead of relying on queue position.
    """
    from coordinator.__main__ import poll_once
    from coordinator.orchestrator import (
        DEBATER_NAMES,
        _advance_awaiting_debaters,
        _advance_awaiting_judge,
        _start_round,
    )
    from coordinator.queue import DebateQueue
    from coordinator.state import SeenState

    logger = logging.getLogger("test.integration")
    # Stays None until the scratch issue exists, so cleanup knows whether
    # there is anything to clean up.
    issue_id = None

    try:
        # --- 1. Create scratch issue ---
        issue = client.create_issue(
            title="[DEBATE-SMOKE-TEST] Integration test — auto-delete",
            description=(
                "This issue is created by the coordinator integration test. "
                "It will be cleaned up automatically.\n\n"
                "Commit: https://git.wylab.me/multica/coordinator/commit/test000"
            ),
        )
        issue_id = issue["id"]
        logger.info("created scratch issue %s", issue_id)

        # Set to in_review so the watcher picks it up
        client.update_issue_status(issue_id, "in_review")

        # --- 2. poll_once → enqueue round ---
        # State and queue live under tmp_path so the run starts from empty
        # files and leaves nothing behind.
        state = SeenState.load(tmp_path / "seen.json")
        queue = DebateQueue.load(tmp_path / "queue.json")
        poll_once(client, state, queue, logger)

        pending = queue.pending()
        assert pending, "Expected round enqueued after poll_once"
        # Pick OUR round by issue id: other in_review issues in the workspace
        # may have been enqueued too, so the first pending round is not
        # guaranteed to belong to the scratch issue.
        round_ = next((r for r in pending if r.issue_id == issue_id), None)
        assert round_ is not None, (
            f"Expected a pending round for scratch issue {issue_id} after poll_once"
        )
        assert round_.status == "pending"

        # --- 3. _start_round → debater comment posted ---
        _start_round(round_, client, queue, cfg, logger)

        assert round_.status == "running", f"Expected running, got {round_.status}"
        assert round_.phase == "awaiting_debaters", f"Expected awaiting_debaters, got {round_.phase}"
        assert round_.coordinator_comment_id, "coordinator_comment_id must be set"

        # Verify comment appeared on the real issue
        real_comments = client.list_comments(issue_id)
        coord_comment = next(
            (c for c in real_comments if c.get("id") == round_.coordinator_comment_id),
            None,
        )
        assert coord_comment is not None, "Coordinator comment not found in issue comments"
        for name in DEBATER_NAMES:
            assert name in coord_comment["content"], f"{name} not mentioned in coordinator comment"

        # --- 4. Post simulated debater responses ---
        _post_fake_debater_replies(client, issue_id)

        # --- 5. _advance_awaiting_debaters → judge comment posted ---
        _advance_awaiting_debaters(round_, client, queue, cfg, logger)

        assert round_.phase == "awaiting_judge", f"Expected awaiting_judge, got {round_.phase}"
        assert round_.judge_comment_id, "judge_comment_id must be set"

        real_comments = client.list_comments(issue_id)
        judge_comment = next(
            (c for c in real_comments if c.get("id") == round_.judge_comment_id),
            None,
        )
        assert judge_comment is not None, "Judge comment not found in issue comments"
        assert "Verdict requested" in judge_comment["content"]

        # --- 6. Post simulated judge verdict ---
        _post_fake_judge_verdict(client, issue_id, verdict="ACCEPT")

        # --- 7. _advance_awaiting_judge → issue status updated ---
        _advance_awaiting_judge(round_, client, queue, cfg, logger)

        assert round_.status == "done", f"Expected done, got {round_.status}"
        assert round_.phase == "accepted", f"Expected accepted, got {round_.phase}"

        # Verify issue status via real API (not just in-memory)
        refreshed = client.get_issue(issue_id)
        assert refreshed.get("status") == "done", (
            f"Issue status should be 'done' after ACCEPT verdict, "
            f"got {refreshed.get('status')!r}"
        )

        # RACE FIX: after _advance_awaiting_judge completes, issue is no longer in_review,
        # so poll_once cannot double-enqueue this round.
        # Note: other in_review issues in the workspace may also get enqueued — that's OK.
        poll_once(client, state, queue, logger)
        our_rounds = [r for r in queue.rounds if r.issue_id == issue_id]
        assert len(our_rounds) == 1, (
            f"poll_once created a second round for issue {issue_id} (double-enqueue)"
        )

        logger.info("VERDICT OK — full lifecycle passed")

    finally:
        # --- 8. Cleanup ---
        # Best effort: move the scratch issue out of in_review so later polls
        # ignore it. A cleanup failure is logged rather than raised so it
        # cannot mask the real test outcome.
        if issue_id:
            try:
                client.update_issue_status(issue_id, "done")
                logger.info("cleanup: issue %s set to done", issue_id)
            except Exception as exc:
                logger.warning("cleanup failed for issue %s: %s", issue_id, exc)
|