0e44846032
New module: src/coordinator/orchestrator.py
- DEBATER_NAMES, JUDGE_NAME, DEBATER_PROMPTS, JUDGE_PROMPT_TEMPLATE hardcoded for v1
- Per-debater prompts tell each debater exactly which tool output to ground evidence in
- orchestrate_pending() is the main entry point called from watch_loop
- _start_round(): pending→running, posts debater mention comment, phase→awaiting_debaters
- _advance_awaiting_debaters(): polls for replies, handles timeout with partial evidence,
posts judge comment, phase→awaiting_judge
- _advance_awaiting_judge(): polls for verdict; RACE FIX — update_issue_status() called
BEFORE queue.update_status("done") so poll_once can never double-enqueue
- Detection: primary=author_id match, fallback=[{name} response]: content marker (enables tests)
- Restart-safe: phase field persisted on every mutation; in-flight rounds resume correctly
Extended src/coordinator/queue.py:
- Round gains phase, phase_entered_at, coordinator_comment_id, judge_comment_id fields
- DebateQueue.update_phase() and running() added
- All new fields default-empty so existing queue.json files load cleanly
Extended src/coordinator/multica_client.py:
- update_issue_status() convenience wrapper
- create_issue() for integration / smoke tests
Updated src/coordinator/__main__.py:
- _orchestrate_pending stub replaced with real import from orchestrator
Tests:
- tests/test_orchestrator.py: 32 new unit tests covering phase transitions, timeouts,
race fix ordering, restart resume, full lifecycle
- tests/test_integration.py: @pytest.mark.integration test against real API
- smoke_test.py: standalone end-to-end script; ran against real API, verdict OK
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
206 lines
7.4 KiB
Python
206 lines
7.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Standalone smoke test for debate round orchestration (WYL-45).
|
|
|
|
Runs one full round end-to-end against the real multica API, cleans up,
|
|
and prints verdict OK or FAIL.
|
|
|
|
Usage:
|
|
python smoke_test.py
|
|
|
|
Requires env vars (or ~/.coordinator/env):
|
|
COORDINATOR_SERVER_URL
|
|
COORDINATOR_WORKSPACE_ID
|
|
COORDINATOR_TOKEN
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
|
|
def main() -> int:
    """Run one debate round end-to-end against the real multica API.

    The script creates a scratch issue, moves it to ``in_review``, lets
    ``poll_once`` enqueue it, then drives the round through the orchestrator
    phase functions while posting fake debater and judge comments.  Every
    expectation is recorded through ``check`` instead of ``assert`` so that a
    single run reports all failures at once.

    Returns:
        0 when every check passed ("verdict OK" is printed), 1 when the
        coordinator package cannot be imported, required environment
        variables are missing, configuration fails, or any check failed.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        stream=sys.stderr,
    )
    log = logging.getLogger("smoke_test")

    # ---- load coordinator package ------------------------------------------
    # Allow running from a checkout without installing the package first.
    src = Path(__file__).parent / "src"
    if src.is_dir():
        sys.path.insert(0, str(src))

    try:
        from coordinator.config import Config, load_env_file
        from coordinator.multica_client import MulticaClient
        from coordinator.queue import DebateQueue
        from coordinator.state import SeenState
        from coordinator.__main__ import poll_once
        from coordinator.orchestrator import (
            DEBATER_NAMES,
            _advance_awaiting_debaters,
            _advance_awaiting_judge,
            _start_round,
        )
    except ImportError as exc:
        print(f"FAIL: cannot import coordinator package: {exc}", file=sys.stderr)
        # The hint belongs with the failure line, so it goes to stderr too.
        print(" Run 'pip install -e .' in the coordinator directory first.",
              file=sys.stderr)
        return 1

    load_env_file()
    required = ["COORDINATOR_SERVER_URL", "COORDINATOR_WORKSPACE_ID", "COORDINATOR_TOKEN"]
    missing = [k for k in required if not os.environ.get(k)]
    if missing:
        print(f"FAIL: missing env vars: {', '.join(missing)}", file=sys.stderr)
        return 1

    try:
        cfg = Config.from_env()
    except SystemExit as exc:
        # Config.from_env() may bail out via SystemExit; turn that into a
        # regular FAIL line instead of letting it terminate the script.
        print(f"FAIL: {exc}", file=sys.stderr)
        return 1

    client = MulticaClient(cfg.server_url, cfg.workspace_id, cfg.token)
    issue_id: str | None = None
    failures: list[str] = []

    def check(cond: bool, msg: str) -> None:
        """Record *msg* as a failure (and log it) when *cond* is false."""
        if not cond:
            failures.append(msg)
            log.error("ASSERTION FAILED: %s", msg)

    def report() -> int:
        """Print the final verdict and return the matching exit code."""
        if failures:
            print(f"\nFAIL ({len(failures)} assertion(s)):")
            for failure in failures:
                print(f" - {failure}")
            return 1
        print("\nverdict OK")
        return 0

    # State and queue files live in a throwaway directory so the smoke test
    # never touches files outside it.
    with tempfile.TemporaryDirectory() as td:
        tmp = Path(td)
        state = SeenState.load(tmp / "seen.json")
        queue = DebateQueue.load(tmp / "queue.json")

        try:
            # 1. Create scratch issue
            log.info("creating scratch issue...")
            issue = client.create_issue(
                title="[DEBATE-SMOKE-TEST] WYL-45 smoke — auto-delete",
                description=(
                    "Smoke test issue for WYL-45. Auto-deleted after test.\n\n"
                    "Commit: https://git.wylab.me/multica/coordinator/commit/smoke000"
                ),
            )
            issue_id = issue["id"]
            log.info("created issue %s", issue_id)

            # 2. Set in_review and enqueue via poll_once
            client.update_issue_status(issue_id, "in_review")
            poll_once(client, state, queue, log)

            # poll_once may enqueue other in_review issues as well, so pick
            # OUR round explicitly; driving pending()[0] could post comments
            # and a verdict on an unrelated real issue.
            round_ = next(
                (r for r in queue.pending() if r.issue_id == issue_id),
                None,
            )
            check(round_ is not None, "round not enqueued after poll_once")
            if round_ is None:
                # Nothing to drive; still print the verdict summary.  The
                # ``finally`` clause below performs the cleanup.
                return report()

            # 3. Start the round
            log.info("starting round...")
            _start_round(round_, client, queue, cfg, log)

            check(round_.status == "running", f"status={round_.status!r} (want 'running')")
            check(round_.phase == "awaiting_debaters",
                  f"phase={round_.phase!r} (want 'awaiting_debaters')")
            check(bool(round_.coordinator_comment_id), "coordinator_comment_id not set")

            # Verify comment on real issue
            real_comments = client.list_comments(issue_id)
            coord_comment = next(
                (c for c in real_comments if c.get("id") == round_.coordinator_comment_id),
                None,
            )
            check(coord_comment is not None, "coordinator comment not found in issue")
            if coord_comment:
                for name in DEBATER_NAMES:
                    check(name in coord_comment["content"],
                          f"debater '{name}' not mentioned in coordinator comment")

            # 4. Simulate debater responses (content-marker fallback)
            log.info("posting fake debater responses...")
            for name in DEBATER_NAMES:
                client.post_comment(
                    issue_id,
                    f"[{name} response]: Smoke-test evidence. "
                    f"This simulates {name}'s analysis.",
                )

            # 5. Advance to awaiting_judge
            log.info("advancing past debaters...")
            _advance_awaiting_debaters(round_, client, queue, cfg, log)

            check(round_.phase == "awaiting_judge",
                  f"phase={round_.phase!r} (want 'awaiting_judge')")
            check(bool(round_.judge_comment_id), "judge_comment_id not set")

            real_comments = client.list_comments(issue_id)
            judge_comment = next(
                (c for c in real_comments if c.get("id") == round_.judge_comment_id),
                None,
            )
            check(judge_comment is not None, "judge comment not found in issue")
            if judge_comment:
                check("Verdict requested" in judge_comment["content"],
                      "judge comment missing 'Verdict requested'")

            # 6. Simulate judge verdict
            log.info("posting fake judge verdict...")
            client.post_comment(
                issue_id,
                "VERDICT: ACCEPT\n\nSmoke test passed. Implementation is complete.",
            )

            # 7. Advance past judge
            log.info("processing verdict...")
            _advance_awaiting_judge(round_, client, queue, cfg, log)

            check(round_.status == "done", f"status={round_.status!r} (want 'done')")
            check(round_.phase == "accepted", f"phase={round_.phase!r} (want 'accepted')")

            refreshed = client.get_issue(issue_id)
            check(refreshed.get("status") == "done",
                  f"issue status={refreshed.get('status')!r} (want 'done')")

            # 8. Race-fix check: poll_once must not double-enqueue OUR issue.
            # (Other in_review issues may also be enqueued — that's expected.)
            poll_once(client, state, queue, log)
            our_rounds_after = [
                r for r in queue.rounds if r.issue_id == issue_id
            ]
            check(
                len(our_rounds_after) == 1,
                f"double-enqueue: issue {issue_id} appears {len(our_rounds_after)} time(s) "
                f"in queue (want 1)",
            )

        except Exception as exc:
            # Top-level boundary: any unexpected error becomes a recorded
            # failure so the summary below still gets printed.
            log.exception("unexpected error: %s", exc)
            failures.append(f"unexpected exception: {exc}")
        finally:
            # Best-effort cleanup.  NOTE: the scratch issue is only moved to
            # "done" here; this code does not delete it.
            if issue_id:
                try:
                    client.update_issue_status(issue_id, "done")
                    log.info("cleanup: issue %s → done", issue_id)
                except Exception as exc:
                    log.warning("cleanup failed: %s", exc)

    return report()
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|