Files
coordinator/tests/test_orchestrator.py
T
m-senior-developer 0e44846032 Implement debate round orchestration (WYL-45)
New module: src/coordinator/orchestrator.py
- DEBATER_NAMES, JUDGE_NAME, DEBATER_PROMPTS, JUDGE_PROMPT_TEMPLATE hardcoded for v1
- Per-debater prompts tell each debater exactly which tool output to ground evidence in
- orchestrate_pending() is the main entry point called from watch_loop
- _start_round(): pending→running, posts debater mention comment, phase→awaiting_debaters
- _advance_awaiting_debaters(): polls for replies, handles timeout with partial evidence,
  posts judge comment, phase→awaiting_judge
- _advance_awaiting_judge(): polls for verdict; RACE FIX — update_issue_status() called
  BEFORE queue.update_status("done") so poll_once can never double-enqueue
- Detection: primary = author_id match; fallback = a "[{name} response]:" content marker (enables tests)
- Restart-safe: phase field persisted on every mutation; in-flight rounds resume correctly

Extended src/coordinator/queue.py:
- Round gains phase, phase_entered_at, coordinator_comment_id, judge_comment_id fields
- DebateQueue.update_phase() and running() added
- All new fields default-empty so existing queue.json files load cleanly

Extended src/coordinator/multica_client.py:
- update_issue_status() convenience wrapper
- create_issue() for integration / smoke tests

Updated src/coordinator/__main__.py:
- _orchestrate_pending stub replaced with real import from orchestrator

Tests:
- tests/test_orchestrator.py: 32 new unit tests covering phase transitions, timeouts,
  race fix ordering, restart resume, full lifecycle
- tests/test_integration.py: @pytest.mark.integration test against real API
- smoke_test.py: standalone end-to-end script; ran against real API, verdict OK

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 21:43:17 +00:00

737 lines
24 KiB
Python

"""Unit tests for coordinator.orchestrator (WYL-45)."""
from __future__ import annotations
import logging
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Any
import pytest
from coordinator.orchestrator import (
DEBATER_NAMES,
JUDGE_NAME,
_advance_awaiting_debaters,
_advance_awaiting_judge,
_collect_debater_replies,
_collect_judge_reply,
_find_commit_url,
_parse_verdict,
_start_round,
orchestrate_pending,
)
from coordinator.queue import DebateQueue, Round
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _utcnow() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now = datetime.now(timezone.utc)
    return now.strftime("%Y-%m-%dT%H:%M:%SZ")
def _past(seconds: float) -> str:
    """Return the UTC timestamp *seconds* ago as ``YYYY-MM-DDTHH:MM:SSZ``."""
    offset = timedelta(seconds=seconds)
    return (datetime.now(timezone.utc) - offset).strftime("%Y-%m-%dT%H:%M:%SZ")
@dataclass
class FakeConfig:
    """Plain-data stand-in for the coordinator configuration used by the tests.

    Every field has a fixed default, so ``FakeConfig()`` is usable as-is and
    individual values can be overridden by keyword.
    """

    # Connection settings (placeholder values).
    server_url: str = "http://fake"
    workspace_id: str = "ws-1"
    token: str = "tok"

    # Timing and concurrency knobs.
    poll_interval_s: int = 30
    round_timeout_s: int = 600
    max_concurrent_rounds: int = 3

    # File locations (defaults point under /tmp).
    seen_file: Path = Path("/tmp/seen.json")
    queue_file: Path = Path("/tmp/queue.json")
    log_file: Path = Path("/tmp/coordinator.log")
@dataclass
class FakeClient:
    """In-memory substitute for MulticaClient whose state tests can seed and inspect.

    ``issue``, ``comments`` and ``agents`` are the data served back to the
    caller; ``posted_comments`` and ``status_updates`` record what was written.
    """

    # The one issue returned by every lookup, whatever ID is requested.
    issue: dict[str, Any] = field(default_factory=lambda: {
        "id": "issue-1",
        "title": "Test issue",
        "description": "Test description. Commit: https://git.example.com/commit/abc123",
        "status": "in_review",
    })
    # Comment thread served by ``list_comments``; ``post_comment`` appends to it.
    comments: list[dict[str, Any]] = field(default_factory=list)
    # Agent records served by ``list_agents`` / ``find_agents_by_name``.
    agents: list[dict[str, Any]] = field(default_factory=list)
    # Bodies of every comment posted through ``post_comment``, in call order.
    posted_comments: list[str] = field(default_factory=list)
    # ``(issue_id, status)`` pairs recorded by ``update_issue_status``.
    status_updates: list[tuple[str, str]] = field(default_factory=list)
    # ID handed to the next posted comment.
    post_comment_returns_id: str = "comment-coord"

    def get_issue(self, issue_id: str) -> dict[str, Any]:
        """Return the stored issue; *issue_id* is not consulted."""
        return self.issue

    def list_comments(self, issue_id: str) -> list[dict[str, Any]]:
        """Return a shallow copy of the comment thread."""
        return self.comments[:]

    def post_comment(self, issue_id: str, content: str) -> dict[str, Any]:
        """Record *content*, append a coordinator-authored comment, and return it."""
        assigned_id = self.post_comment_returns_id
        self.posted_comments.append(content)
        # Rotate the ID so the next post gets "comment-<number posted so far>".
        self.post_comment_returns_id = "comment-" + str(len(self.posted_comments))
        record = {
            "id": assigned_id,
            "content": content,
            "created_at": _utcnow(),
            "author_id": "coord",
            "author_type": "member",
        }
        self.comments.append(record)
        return record

    def list_agents(self) -> list[dict[str, Any]]:
        """Return a shallow copy of the agent list."""
        return self.agents[:]

    def find_agents_by_name(self, names) -> dict[str, str]:
        """Map each requested name that exists in ``agents`` to its agent ID."""
        requested = set(names)
        matches: dict[str, str] = {}
        for agent in self.agents:
            if agent["name"] in requested:
                matches[agent["name"]] = agent["id"]
        return matches

    def update_issue_status(self, issue_id: str, status: str) -> dict[str, Any]:
        """Log the status change, apply it to the stored issue, and return the issue."""
        self.status_updates.append((issue_id, status))
        self.issue["status"] = status
        return self.issue

    def update_issue(self, issue_id: str, **fields: Any) -> dict[str, Any]:
        """Merge *fields* into the stored issue and return it."""
        for key, value in fields.items():
            self.issue[key] = value
        return self.issue
def _make_round(
    *,
    status: str = "pending",
    phase: str = "convened",
    phase_entered_at: str = "",
    coordinator_comment_id: str = "",
    judge_comment_id: str = "",
) -> Round:
    """Build a ``Round`` for issue ``issue-1`` with a fresh random ``round_id``.

    An empty *phase_entered_at* is replaced by the current UTC timestamp.
    """
    round_fields = {
        "round_id": str(uuid.uuid4()),
        "issue_id": "issue-1",
        "identifier": "WYL-99",
        "title": "Test issue",
        "enqueued_at": _utcnow(),
        "status": status,
        "phase": phase,
        "phase_entered_at": phase_entered_at if phase_entered_at else _utcnow(),
        "coordinator_comment_id": coordinator_comment_id,
        "judge_comment_id": judge_comment_id,
    }
    return Round(**round_fields)
def _make_queue(tmp_path: Path, *rounds: Round) -> DebateQueue:
    """Load a queue from ``tmp_path / "queue.json"``, add *rounds*, save, and return it."""
    queue = DebateQueue.load(tmp_path / "queue.json")
    queue.rounds.extend(rounds)
    queue.save()
    return queue
# Shared logger passed to the orchestrator functions under test.
_logger = logging.getLogger("test.orchestrator")
# ---------------------------------------------------------------------------
# _parse_verdict
# ---------------------------------------------------------------------------
def test_parse_verdict_accept():
    """An ACCEPT verdict line yields ``"ACCEPT"``."""
    body = "VERDICT: ACCEPT\n\nGreat work."
    assert _parse_verdict(body) == "ACCEPT"
def test_parse_verdict_reject():
    """A REJECT verdict line yields ``"REJECT"``."""
    body = "VERDICT: REJECT\n\nMissing tests."
    assert _parse_verdict(body) == "REJECT"
def test_parse_verdict_case_insensitive():
    """A lower-case verdict line is still recognised and returned upper-cased."""
    parsed = _parse_verdict("verdict: accept")
    assert parsed == "ACCEPT"
def test_parse_verdict_none_when_absent():
    """Text without a verdict line yields ``None``."""
    parsed = _parse_verdict("No verdict here.")
    assert parsed is None
# ---------------------------------------------------------------------------
# _find_commit_url
# ---------------------------------------------------------------------------
def test_find_commit_url_found():
    """A commit URL embedded in a comment body is returned."""
    expected = "https://git.example.com/multica/foo/commit/abc123"
    thread = [
        {"content": "Watcher implemented. Commit: https://git.example.com/multica/foo/commit/abc123"},
    ]
    assert _find_commit_url(thread) == expected
def test_find_commit_url_returns_last():
    """With several commit URLs in the thread, the one in the last comment is returned."""
    older = {"content": "Commit: https://git.example.com/multica/foo/commit/aaa111"}
    newer = {"content": "Follow-up commit: https://git.example.com/multica/foo/commit/bbb222"}
    found = _find_commit_url([older, newer])
    assert found == "https://git.example.com/multica/foo/commit/bbb222"
def test_find_commit_url_empty_when_absent():
    """A thread with no commit URL yields the empty string."""
    thread = [{"content": "No link here."}]
    assert _find_commit_url(thread) == ""
# ---------------------------------------------------------------------------
# _collect_debater_replies
# ---------------------------------------------------------------------------
def _make_comment(
    cid: str,
    content: str,
    author_id: str = "anon",
    ts: str = "",
) -> dict[str, Any]:
    """Build an agent-authored comment dict; an empty *ts* means "now" (UTC)."""
    created_at = ts if ts else _utcnow()
    comment: dict[str, Any] = {"id": cid, "content": content}
    comment["author_id"] = author_id
    comment["author_type"] = "agent"
    comment["created_at"] = created_at
    return comment
def test_collect_replies_by_agent_id():
    """A reply whose author ID is in the agent map is collected under that debater's name."""
    mention = {
        "id": "coord",
        "content": "Debate opened",
        "author_id": "coord",
        "author_type": "member",
        "created_at": _past(10),
    }
    reply = _make_comment("c1", "Evidence here.", author_id="agent-sd", ts=_utcnow())
    collected = _collect_debater_replies(
        [mention, reply], {"Senior Developer": "agent-sd"}, "coord"
    )
    assert "Senior Developer" in collected
def test_collect_replies_content_fallback():
    """With an empty agent map, the ``[<name> response]:`` content marker identifies the reply."""
    mention = {
        "id": "coord",
        "content": "Debate opened",
        "author_id": "coord",
        "author_type": "member",
        "created_at": _past(10),
    }
    marked_reply = _make_comment(
        "c1",
        "[Senior Developer response]: grep output here.",
        author_id="someone-else",
        ts=_utcnow(),
    )
    collected = _collect_debater_replies([mention, marked_reply], {}, "coord")
    assert "Senior Developer" in collected
def test_collect_replies_skips_before_cutoff():
    """A debater comment that precedes the coordinator's mention is not collected."""
    premature = _make_comment("early", "Early reply", author_id="agent-sd", ts=_past(20))
    mention = {
        "id": "coord",
        "content": "Debate opened",
        "author_id": "coord",
        "author_type": "member",
        "created_at": _past(10),
    }
    collected = _collect_debater_replies(
        [premature, mention], {"Senior Developer": "agent-sd"}, "coord"
    )
    assert "Senior Developer" not in collected
def test_collect_replies_no_duplicate_per_debater():
    """When one debater replies twice, only the first reply is kept."""
    mention = {
        "id": "coord",
        "content": "x",
        "author_id": "coord",
        "author_type": "member",
        "created_at": _past(10),
    }
    first = _make_comment("c1", "First reply", author_id="agent-sd", ts=_utcnow())
    second = _make_comment("c2", "Second reply", author_id="agent-sd", ts=_utcnow())
    collected = _collect_debater_replies(
        [mention, first, second], {"Senior Developer": "agent-sd"}, "coord"
    )
    assert collected["Senior Developer"] == "First reply"
# ---------------------------------------------------------------------------
# _collect_judge_reply
# ---------------------------------------------------------------------------
def test_collect_judge_reply_by_agent_id():
    """A verdict comment authored by the judge's agent ID is returned."""
    request = {
        "id": "jc",
        "content": "Verdict requested",
        "author_id": "coord",
        "created_at": _past(10),
    }
    verdict = _make_comment(
        "j1", "VERDICT: ACCEPT\n\nGreat.", author_id="agent-judge", ts=_utcnow()
    )
    reply = _collect_judge_reply([request, verdict], "agent-judge", "jc")
    assert reply is not None
    assert "ACCEPT" in reply
def test_collect_judge_reply_content_fallback():
    """With no judge agent ID given, a comment carrying a verdict line is still returned."""
    request = {"id": "jc", "content": "x", "author_id": "coord", "created_at": _past(10)}
    verdict = _make_comment(
        "j1", "VERDICT: REJECT\n\nMissing tests.", author_id="anyone", ts=_utcnow()
    )
    reply = _collect_judge_reply([request, verdict], "", "jc")
    assert reply is not None
    assert "REJECT" in reply
def test_collect_judge_reply_none_when_absent():
    """A thread containing only the verdict request yields ``None``."""
    request = {"id": "jc", "content": "x", "author_id": "coord", "created_at": _past(10)}
    reply = _collect_judge_reply([request], "agent-judge", "jc")
    assert reply is None
# ---------------------------------------------------------------------------
# _start_round
# ---------------------------------------------------------------------------
def test_start_round_marks_running_before_post(tmp_path):
    """Status must be 'running' before any API call (early guard against double-enqueue).

    Both the queue's ``update_status`` and the client's ``post_comment`` append
    to one shared list, so the relative order of the two calls can be checked.
    """
    call_order: list[str] = []

    class TrackingQueue(DebateQueue):
        def update_status(self, round_id, status):
            call_order.append(f"update_status:{status}")
            super().update_status(round_id, status)

    class TrackingClient(FakeClient):
        def post_comment(self, issue_id, content):
            call_order.append("post_comment")
            return super().post_comment(issue_id, content)

    r = _make_round()
    q = TrackingQueue.load(tmp_path / "queue.json")
    q.rounds.append(r)
    q.save()

    _start_round(r, TrackingClient(), q, FakeConfig(), _logger)

    # Check that both calls happened first, so a missing call fails with a
    # readable assertion instead of a bare StopIteration from next().
    assert "update_status:running" in call_order, f"got order: {call_order}"
    assert "post_comment" in call_order, f"got order: {call_order}"
    # update_status("running") must precede post_comment
    assert call_order.index("update_status:running") < call_order.index("post_comment"), (
        f"Expected update_status:running before post_comment, got order: {call_order}"
    )
def test_start_round_posts_debater_comment(tmp_path):
    """Starting a round posts exactly one opening comment that names every debater."""
    round_ = _make_round()
    queue = _make_queue(tmp_path, round_)
    client = FakeClient()

    _start_round(round_, client, queue, FakeConfig(), _logger)

    assert len(client.posted_comments) == 1
    opening = client.posted_comments[0]
    assert "Debate round opened" in opening
    assert all(name in opening for name in DEBATER_NAMES)
def test_start_round_sets_phase_awaiting_debaters(tmp_path):
    """After starting, the round is running/awaiting_debaters with the comment ID recorded."""
    round_ = _make_round()
    queue = _make_queue(tmp_path, round_)

    _start_round(round_, FakeClient(), queue, FakeConfig(), _logger)

    assert (round_.phase, round_.status) == ("awaiting_debaters", "running")
    # The ID of the posted comment must have been stored on the round.
    assert round_.coordinator_comment_id
def test_start_round_error_on_api_failure(tmp_path):
    """If posting the opening comment raises, the round ends up in status 'error'."""

    class BrokenClient(FakeClient):
        def post_comment(self, issue_id, content):
            raise RuntimeError("API down")

    round_ = _make_round()
    queue = _make_queue(tmp_path, round_)

    _start_round(round_, BrokenClient(), queue, FakeConfig(), _logger)

    assert round_.status == "error"
# ---------------------------------------------------------------------------
# _advance_awaiting_debaters — debater replies
# ---------------------------------------------------------------------------
def _debater_round(tmp_path: Path) -> tuple[Round, DebateQueue, FakeClient]:
    """Build a (round, queue, client) trio with the round in awaiting_debaters.

    The client's thread already holds the coordinator's opening comment, and
    its agent list contains every debater plus the judge.
    """
    round_ = _make_round(
        status="running",
        phase="awaiting_debaters",
        phase_entered_at=_utcnow(),
        coordinator_comment_id="coord-comment",
    )
    queue = _make_queue(tmp_path, round_)

    client = FakeClient()
    # The opening comment the round's coordinator_comment_id refers to.
    opening = {
        "id": "coord-comment",
        "content": "Debate opened",
        "author_id": "coord",
        "author_type": "member",
        "created_at": _past(5),
    }
    client.comments.append(opening)

    # Debaters get IDs agent-0, agent-1, ... in DEBATER_NAMES order; judge last.
    roster = [{"name": name, "id": f"agent-{i}"} for i, name in enumerate(DEBATER_NAMES)]
    roster.append({"name": JUDGE_NAME, "id": "agent-judge"})
    client.agents = roster

    return round_, queue, client
def test_advance_debaters_waits_when_not_all_replied(tmp_path):
    """With no debater replies, the round stays put and no verdict request is posted."""
    round_, queue, client = _debater_round(tmp_path)

    _advance_awaiting_debaters(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "awaiting_debaters"
    # No judge comment may have gone out yet.
    assert not any("Verdict requested" in body for body in client.posted_comments)
def test_advance_debaters_proceeds_when_all_replied(tmp_path):
    """Once every debater has replied, the judge comment is posted and the phase advances."""
    round_, queue, client = _debater_round(tmp_path)

    # One reply per debater, authored by that debater's agent ID.
    client.comments.extend(
        {
            "id": f"reply-{i}",
            "content": f"[{name} response]: Evidence here.",
            "author_id": f"agent-{i}",
            "author_type": "agent",
            "created_at": _utcnow(),
        }
        for i, name in enumerate(DEBATER_NAMES)
    )

    _advance_awaiting_debaters(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "awaiting_judge"
    assert round_.judge_comment_id
    verdict_request = client.posted_comments[-1]
    assert "Verdict requested" in verdict_request
    assert all(name in verdict_request for name in DEBATER_NAMES)
def test_advance_debaters_timeout_with_partial_evidence(tmp_path):
    """After timeout, proceed with partial evidence (missing debaters noted in transcript)."""
    r, q, client = _debater_round(tmp_path)
    cfg = FakeConfig()

    # Only one debater replies
    client.comments.append({
        "id": "r1",
        "content": f"[{DEBATER_NAMES[0]} response]: My evidence.",
        "author_id": "agent-0",
        "author_type": "agent",
        "created_at": _utcnow(),
    })

    # Simulate timeout by back-dating the phase start beyond the configured
    # round timeout. Deriving the offset from cfg keeps the test on the
    # timeout path even if FakeConfig's default changes.
    r.phase_entered_at = _past(cfg.round_timeout_s + 100)
    q.save()

    _advance_awaiting_debaters(r, client, q, cfg, _logger)

    assert r.phase == "awaiting_judge"
    judge_comment = client.posted_comments[-1]
    assert "timed out" in judge_comment or "no response" in judge_comment
# ---------------------------------------------------------------------------
# _advance_awaiting_judge — verdict handling
# ---------------------------------------------------------------------------
def _judge_round(tmp_path: Path, *, phase_entered_at: str = "") -> tuple[Round, DebateQueue, FakeClient]:
    """Build a (round, queue, client) trio with the round in awaiting_judge.

    The client knows only the judge agent, and its thread holds just the
    verdict-request comment. An empty *phase_entered_at* means "now".
    """
    entered = phase_entered_at if phase_entered_at else _utcnow()
    round_ = _make_round(
        status="running",
        phase="awaiting_judge",
        phase_entered_at=entered,
        coordinator_comment_id="coord-comment",
        judge_comment_id="judge-comment",
    )
    queue = _make_queue(tmp_path, round_)

    client = FakeClient()
    client.agents = [{"name": JUDGE_NAME, "id": "agent-judge"}]
    verdict_request = {
        "id": "judge-comment",
        "content": "Verdict requested",
        "author_id": "coord",
        "author_type": "member",
        "created_at": _past(5),
    }
    client.comments = [verdict_request]
    return round_, queue, client
def test_advance_judge_waits_when_no_verdict(tmp_path):
    """Without a verdict comment, the round and the issue are both left untouched."""
    round_, queue, client = _judge_round(tmp_path)

    _advance_awaiting_judge(round_, client, queue, FakeConfig(), _logger)

    assert (round_.phase, round_.status) == ("awaiting_judge", "running")
    assert client.status_updates == []
def test_advance_judge_accept_updates_issue_status_to_done(tmp_path):
    """An ACCEPT verdict finishes the round as accepted and sets the issue to 'done'."""
    round_, queue, client = _judge_round(tmp_path)
    verdict = {
        "id": "verdict1",
        "content": "VERDICT: ACCEPT\n\nLooks good.",
        "author_id": "agent-judge",
        "author_type": "agent",
        "created_at": _utcnow(),
    }
    client.comments.append(verdict)

    _advance_awaiting_judge(round_, client, queue, FakeConfig(), _logger)

    assert (round_.status, round_.phase) == ("done", "accepted")
    assert client.status_updates == [("issue-1", "done")]
def test_advance_judge_reject_updates_issue_status_to_in_progress(tmp_path):
    """A REJECT verdict finishes the round as rejected and sets the issue to 'in_progress'."""
    round_, queue, client = _judge_round(tmp_path)
    verdict = {
        "id": "verdict1",
        "content": "VERDICT: REJECT\n\nMissing tests.",
        "author_id": "agent-judge",
        "author_type": "agent",
        "created_at": _utcnow(),
    }
    client.comments.append(verdict)

    _advance_awaiting_judge(round_, client, queue, FakeConfig(), _logger)

    assert (round_.status, round_.phase) == ("done", "rejected")
    assert client.status_updates == [("issue-1", "in_progress")]
def test_advance_judge_timeout_marks_error(tmp_path):
    """Judge timeout: round → error, issue left in_review for human escalation."""
    cfg = FakeConfig()
    # Back-date the phase start beyond the configured round timeout. Deriving
    # the offset from cfg keeps the test on the timeout path even if
    # FakeConfig's default changes.
    r, q, client = _judge_round(
        tmp_path, phase_entered_at=_past(cfg.round_timeout_s + 100)
    )

    _advance_awaiting_judge(r, client, q, cfg, _logger)

    assert r.status == "error"
    assert r.phase == "error"
    # Issue status must NOT be changed — leave in_review for humans
    assert not client.status_updates
# ---------------------------------------------------------------------------
# Race condition: issue status before round done (CRITICAL)
# ---------------------------------------------------------------------------
def test_issue_status_updated_before_round_marked_done(tmp_path):
    """RACE FIX: the issue status update must be recorded before the round is marked done.

    The client and queue subclasses both log into one shared list, so the
    relative order of the two writes can be asserted.
    """
    events: list[str] = []

    class TrackingClient(FakeClient):
        def update_issue_status(self, issue_id, status):
            events.append(f"issue:{status}")
            return super().update_issue_status(issue_id, status)

    class TrackingQueue(DebateQueue):
        def update_status(self, round_id, status):
            events.append(f"round:{status}")
            super().update_status(round_id, status)

    round_ = _make_round(
        status="running",
        phase="awaiting_judge",
        phase_entered_at=_utcnow(),
        coordinator_comment_id="coord-comment",
        judge_comment_id="judge-comment",
    )
    queue = TrackingQueue.load(tmp_path / "queue.json")
    queue.rounds.append(round_)
    queue.save()

    client = TrackingClient()
    client.agents = [{"name": JUDGE_NAME, "id": "agent-judge"}]
    verdict_request = {
        "id": "judge-comment",
        "content": "x",
        "author_id": "coord",
        "created_at": _past(5),
    }
    verdict = {
        "id": "v1",
        "content": "VERDICT: ACCEPT\n\nAll good.",
        "author_id": "agent-judge",
        "author_type": "agent",
        "created_at": _utcnow(),
    }
    client.comments = [verdict_request, verdict]

    _advance_awaiting_judge(round_, client, queue, FakeConfig(), _logger)

    # Neither write may be missing ...
    assert "issue:done" in events
    assert "round:done" in events
    # ... and the issue write has to come first.
    issue_at = events.index("issue:done")
    round_at = events.index("round:done")
    assert issue_at < round_at, (
        f"Expected issue:done before round:done, got order: {events}"
    )
def test_round_not_marked_done_if_issue_update_fails(tmp_path):
    """When the issue status update raises, the round stays 'running' for a later retry."""

    class FailingClient(FakeClient):
        def update_issue_status(self, issue_id, status):
            raise RuntimeError("network error")

    round_ = _make_round(
        status="running",
        phase="awaiting_judge",
        phase_entered_at=_utcnow(),
        coordinator_comment_id="coord-comment",
        judge_comment_id="judge-comment",
    )
    queue = _make_queue(tmp_path, round_)

    client = FailingClient()
    client.agents = [{"name": JUDGE_NAME, "id": "agent-judge"}]
    verdict_request = {
        "id": "judge-comment",
        "content": "x",
        "author_id": "coord",
        "created_at": _past(5),
    }
    verdict = {
        "id": "v1",
        "content": "VERDICT: ACCEPT",
        "author_id": "agent-judge",
        "created_at": _utcnow(),
    }
    client.comments = [verdict_request, verdict]

    _advance_awaiting_judge(round_, client, queue, FakeConfig(), _logger)

    # Still running, so the next cycle can try again.
    assert round_.status == "running"
# ---------------------------------------------------------------------------
# Phase transitions: full cycle
# ---------------------------------------------------------------------------
def test_full_phase_cycle(tmp_path):
    """pending → running/awaiting_debaters → awaiting_judge → accepted/done.

    Drives one round through all three orchestrator steps against a single
    FakeClient, appending the debater and judge comments between steps.
    """
    r = _make_round()
    q = _make_queue(tmp_path, r)

    # Step 1: start_round
    client = FakeClient()
    client.agents = [
        *[{"name": n, "id": f"agent-{i}"} for i, n in enumerate(DEBATER_NAMES)],
        {"name": JUDGE_NAME, "id": "agent-judge"},
    ]
    _start_round(r, client, q, FakeConfig(), _logger)
    assert r.status == "running"
    assert r.phase == "awaiting_debaters"
    # The opening comment's ID must be recorded on the round (this was
    # previously read into an unused local instead of being checked).
    assert r.coordinator_comment_id

    # Step 2: debaters reply
    for i, name in enumerate(DEBATER_NAMES):
        client.comments.append({
            "id": f"reply-{i}", "content": f"[{name} response]: Evidence.",
            "author_id": f"agent-{i}", "author_type": "agent", "created_at": _utcnow(),
        })
    _advance_awaiting_debaters(r, client, q, FakeConfig(), _logger)
    assert r.phase == "awaiting_judge"
    # Likewise the judge comment's ID must be recorded.
    assert r.judge_comment_id

    # Step 3: judge replies
    client.comments.append({
        "id": "verdict1", "content": "VERDICT: ACCEPT\n\nShipped.",
        "author_id": "agent-judge", "author_type": "agent", "created_at": _utcnow(),
    })
    _advance_awaiting_judge(r, client, q, FakeConfig(), _logger)
    assert r.phase == "accepted"
    assert r.status == "done"
    assert client.issue["status"] == "done"
# ---------------------------------------------------------------------------
# Restart resume: in-flight rounds resume from correct phase
# ---------------------------------------------------------------------------
def test_restart_resumes_awaiting_debaters(tmp_path):
    """A round loaded as running/awaiting_debaters is polled without re-posting the opening comment."""
    round_ = _make_round(
        status="running",
        phase="awaiting_debaters",
        phase_entered_at=_utcnow(),
        coordinator_comment_id="existing-coord-comment",
    )
    queue = _make_queue(tmp_path, round_)

    client = FakeClient()
    opening = {
        "id": "existing-coord-comment",
        "content": "Debate opened",
        "author_id": "coord",
        "created_at": _past(60),
    }
    client.comments = [opening]

    orchestrate_pending(queue, FakeConfig(), _logger, client=client)

    # No second opening comment ...
    assert not any("Debate round opened" in body for body in client.posted_comments)
    # ... and, with no replies in the thread, the phase is unchanged.
    assert round_.phase == "awaiting_debaters"
def test_restart_resumes_awaiting_judge(tmp_path):
    """A round loaded as running/awaiting_judge is polled without re-posting the judge comment."""
    round_ = _make_round(
        status="running",
        phase="awaiting_judge",
        phase_entered_at=_utcnow(),
        coordinator_comment_id="coord-c",
        judge_comment_id="judge-c",
    )
    queue = _make_queue(tmp_path, round_)

    client = FakeClient()
    verdict_request = {
        "id": "judge-c",
        "content": "Verdict requested",
        "author_id": "coord",
        "created_at": _past(30),
    }
    client.comments = [verdict_request]

    orchestrate_pending(queue, FakeConfig(), _logger, client=client)

    # Nothing new containing the verdict request may have been posted.
    assert not any("Verdict requested" in body for body in client.posted_comments)
    assert round_.phase == "awaiting_judge"
# ---------------------------------------------------------------------------
# orchestrate_pending: pending rounds are started, running rounds advanced
# ---------------------------------------------------------------------------
def test_orchestrate_pending_starts_pending(tmp_path):
    """A pending round is switched to running and its opening comment is posted."""
    round_ = _make_round()
    queue = _make_queue(tmp_path, round_)
    client = FakeClient()

    orchestrate_pending(queue, FakeConfig(), _logger, client=client)

    assert round_.status == "running"
    opened = ["Debate round opened" in body for body in client.posted_comments]
    assert any(opened)
def test_orchestrate_pending_advances_running(tmp_path):
    """A running/awaiting_judge round whose thread holds a verdict is completed."""
    round_ = _make_round(
        status="running",
        phase="awaiting_judge",
        phase_entered_at=_utcnow(),
        coordinator_comment_id="coord-c",
        judge_comment_id="judge-c",
    )
    queue = _make_queue(tmp_path, round_)

    client = FakeClient()
    client.agents = [{"name": JUDGE_NAME, "id": "agent-judge"}]
    verdict_request = {
        "id": "judge-c",
        "content": "x",
        "author_id": "coord",
        "created_at": _past(10),
    }
    verdict = {
        "id": "v1",
        "content": "VERDICT: REJECT\n\nNeeds work.",
        "author_id": "agent-judge",
        "created_at": _utcnow(),
    }
    client.comments = [verdict_request, verdict]

    orchestrate_pending(queue, FakeConfig(), _logger, client=client)

    assert (round_.status, round_.phase) == ("done", "rejected")