Files
coordinator/tests/test_orchestrator.py
T
m-platform-admin 00ff80fbbb Port CEK's debate-round instructions verbatim; prevent sycophantic convergence
Live round on WYL-72 exposed that my debate-round comment produced
sycophancy, not debate.  Judge-Gemini moved from a 4.0 ACCEPT stance to
"I agree my initial score was too lenient... I have adjusted my scores
to 3s and 2s" after reading Judge-GPT's 2.92 report — without defending
its original scores or challenging GPT's evidence.  Classic social-pressure
convergence.

Root cause: my debate comment said "You may hold your position if you
have new evidence; you may move if you find the other reasoning more
grounded.  Do not split the difference to compromise."  That phrasing
is weaker than CEK's intent, AND it dropped every structural
anti-sycophancy instruction CEK spelled out in judge-with-debate/SKILL.md:

  Missing: "Identify disagreements (where your scores differ by >1 point)"
  Missing: "Defend your position with evidence from the specification"
  Missing: "Challenge the other judge's position with counter-evidence"
  Missing: "Only revise if you find their evidence compelling"
  Missing: "Defend your original scores if you still believe them"

Also: I asked judges to post a REVISED report (implicitly retracting
their prior position).  CEK asks them to APPEND a debate round section
to their prior report, keeping both visible so the revision is a change
ON TOP OF the original rather than a replacement.

Fixed by porting CEK's instruction block verbatim into _build_debate_round_comment.
Added a regression test that fails if any future edit removes these exact
clauses.

Tests: 72 passed (+1 regression test).
2026-04-18 22:39:30 +02:00

767 lines
28 KiB
Python

"""Tests for the CEK-native review pipeline (meta-judge + 3 judges + consensus)."""
from __future__ import annotations
import logging
import pathlib
from datetime import datetime, timezone
from typing import Any
import pytest
import yaml
from coordinator.queue import DebateQueue, Round
from coordinator.orchestrator import (
ACCEPT_MIN_SCORE,
CONSENSUS_CRITERION_THRESHOLD,
CONSENSUS_OVERALL_THRESHOLD,
JUDGE_NAMES,
MAX_DEBATE_ROUNDS,
META_JUDGE_NAME,
REWORK_INSTRUCTIONS,
_advance_awaiting_debate,
_advance_awaiting_judges,
_advance_awaiting_rubric,
_advance_round,
_apply_verdict,
_build_coordinator_note_no_agent,
_build_debate_round_comment,
_build_judge_mention_comment,
_build_meta_judge_mention,
_build_retrigger_comment,
_check_consensus,
_criterion_scores,
_extract_yaml,
_find_commit_url,
_find_reply_by_agent,
_overall_score,
_parse_judge_report,
_parse_rubric,
_post_rejection_retrigger,
_start_round,
_utcnow,
)
# Logger instance passed to the orchestrator functions exercised below.
_logger = logging.getLogger("test.orchestrator")
# ---------------------------------------------------------------------------
# Fakes
# ---------------------------------------------------------------------------
class FakeConfig:
    """Static stand-in for the coordinator configuration used by these tests."""

    # Connection settings; the values are placeholders.
    server_url: str = "http://x"
    workspace_id: str = "wid"
    token: str = "tok"

    # Timing and concurrency settings; tests read round_timeout_s to build
    # timestamps that lie past the timeout.
    poll_interval_s: int = 30
    round_timeout_s: int = 600
    max_concurrent_rounds: int = 3
class FakeClient:
def __init__(self) -> None:
self.issue: dict[str, Any] = {
"id": "issue-1",
"title": "Do the thing",
"description": "Please do the thing clearly.",
"status": "in_review",
"assignee_type": None,
"assignee_id": None,
}
self.comments: list[dict[str, Any]] = []
self.posted_comments: list[str] = []
self.agents: list[dict[str, Any]] = [
{"name": META_JUDGE_NAME, "id": "agent-meta"},
{"name": "Judge-GPT", "id": "agent-gpt"},
{"name": "Judge-Claude", "id": "agent-claude"},
{"name": "Judge-Gemini", "id": "agent-gemini"},
]
self._next_comment_id = 1000
def get_issue(self, issue_id: str) -> dict[str, Any]:
return dict(self.issue)
def update_issue_status(self, issue_id: str, status: str) -> dict[str, Any]:
self.issue["status"] = status
return {"id": issue_id, "status": status}
def list_comments(self, issue_id: str) -> list[dict[str, Any]]:
return list(self.comments)
def post_comment(self, issue_id: str, content: str) -> dict[str, Any]:
self.posted_comments.append(content)
cid = f"posted-{self._next_comment_id}"
self._next_comment_id += 1
created = _utcnow()
self.comments.append({
"id": cid,
"content": content,
"author_id": "coord-user",
"created_at": created,
})
return {"id": cid, "created_at": created}
def list_agents(self) -> list[dict[str, Any]]:
return list(self.agents)
def find_agents_by_name(self, names):
want = set(names)
return {a["name"]: a["id"] for a in self.agents if a["name"] in want}
def get_agent_name(self, agent_id: str) -> str | None:
for a in self.agents:
if a["id"] == agent_id:
return a["name"]
return None
def _past(seconds_ago: int) -> str:
from datetime import timedelta
t = datetime.now(timezone.utc) - timedelta(seconds=seconds_ago)
return t.strftime("%Y-%m-%dT%H:%M:%SZ")
def _reply_comment(agent_id: str, content: str, created: str | None = None) -> dict[str, Any]:
return {
"id": f"reply-{agent_id}",
"author_id": agent_id,
"content": content,
"created_at": created or _utcnow(),
}
def _rubric_yaml_sample() -> str:
    """Return a YAML rubric with two checklist items and two rubric dimensions."""
    checklist = [
        {"question": "Does the code compile?", "category": "hard_rule", "importance": "essential", "rationale": "basic"},
        {"question": "Is documentation present?", "category": "principle", "importance": "important", "rationale": "quality"},
    ]
    dimensions = [
        {"name": "Correctness", "description": "Does the code work", "scale": "1-5", "weight": 0.6, "score_definitions": {1: "no", 5: "perfect"}},
        {"name": "Clarity", "description": "Readability", "scale": "1-5", "weight": 0.4, "score_definitions": {1: "opaque", 5: "crystal"}},
    ]
    # sort_keys=False keeps the keys in the order written here.
    return yaml.safe_dump(
        {"checklist": checklist, "rubric_dimensions": dimensions},
        sort_keys=False,
    )
def _judge_report(final_score: float, criteria: dict[str, float]) -> str:
    """Render a fenced YAML judge report.

    Every entry of ``criteria`` becomes a rubric score with weight 0.5, and
    ``final_score`` is stored under ``score_calculation``.
    """
    rubric_scores = [
        {"name": criterion, "score": score, "weight": 0.5}
        for criterion, score in criteria.items()
    ]
    payload = {
        "evaluation_report": {
            "score_calculation": {"final_score": final_score},
            "rubric_scores": rubric_scores,
            "executive_summary": f"score {final_score}",
        }
    }
    dumped = yaml.safe_dump(payload, sort_keys=False)
    return "".join(["```yaml\n", dumped, "\n```"])
def _make_round(tmp_path: pathlib.Path, **overrides) -> tuple[Round, DebateQueue]:
    """Create a round, apply ``overrides`` as attributes, and save it in a queue.

    The queue is loaded from ``tmp_path / "queue.json"``; the round is appended
    and the queue saved before both are returned.
    """
    fields = {
        "round_id": "r1",
        "issue_id": "issue-1",
        "identifier": "WYL-X",
        "title": "Do the thing",
        "enqueued_at": _utcnow(),
        "status": "running",
        "phase": "convened",
        "phase_entered_at": _utcnow(),
    }
    new_round = Round(**fields)
    for attr, value in overrides.items():
        setattr(new_round, attr, value)

    queue = DebateQueue.load(tmp_path / "queue.json")
    queue.rounds.append(new_round)
    queue.save()
    return new_round, queue
# ---------------------------------------------------------------------------
# Pure helpers
# ---------------------------------------------------------------------------
def test_extract_yaml_from_fenced_block():
    """The body of a fenced yaml block is pulled out of the surrounding prose."""
    comment = "Here is the rubric:\n\n```yaml\nchecklist:\n - question: foo\n```\n\nDone."
    extracted = _extract_yaml(comment)
    assert extracted.startswith("checklist:")
    assert "question: foo" in extracted
def test_extract_yaml_from_unfenced_content():
    """Content without a fence comes back as the stripped input."""
    raw = "checklist:\n - question: foo"
    expected = raw.strip()
    assert _extract_yaml(raw) == expected
def test_extract_yaml_unescapes_html_entities():
    """HTML entities in comment content are decoded before the YAML is returned.

    The input deliberately carries ``&quot;`` instead of literal double quotes;
    the extracted text must contain real quotes and no leftover entity.
    """
    # Multica REST API returns comment content with `"` as `&quot;`, `>` as `&gt;`, etc.
    content = "checklist:\n - id: &quot;CK-001&quot;\n question: &quot;does it work?&quot;"
    y = _extract_yaml(content)
    assert '"CK-001"' in y
    assert "&quot;" not in y
def test_parse_rubric_accepts_html_encoded_input():
    """A rubric whose quotes arrive as ``&quot;`` entities still parses.

    The checklist item is nested under ``evaluation_specification.checklist``
    so that its fields belong to the item rather than to the parent mapping.
    """
    encoded = (
        "rrd_cycle_applied: true\n"
        "evaluation_specification:\n"
        "  checklist:\n"
        "    - id: &quot;CK-001&quot;\n"
        "      question: &quot;does it work?&quot;\n"
        "      category: &quot;hard_rule&quot;\n"
        "      importance: &quot;essential&quot;\n"
    )
    spec = _parse_rubric(encoded)
    assert spec is not None
    assert "checklist" in spec
def test_extract_yaml_repairs_backslash_backtick():
    """Backslash-escaped backticks are rewritten to plain backticks."""
    # Gemini (and similar) emit \` inside double-quoted YAML strings, imitating
    # markdown escaping. \` is not a valid YAML escape, so we repair it.
    raw = "evaluation_report:\n rubric_scores:\n - name: X\n score: 4\n evidence: \"see \\`foo.py\\` and \\`bar.py\\`\"\n"
    repaired = _extract_yaml(raw)
    assert "`foo.py`" in repaired
    assert "\\`" not in repaired
def test_parse_judge_report_tolerates_backslash_backtick():
    """A report containing backslash-escaped backticks in a quoted string still parses.

    ``final_score`` is nested under ``score_calculation`` and the rubric entry
    fields are nested under the list item, which is the structure the final
    assertion indexes into.
    """
    content = (
        "```yaml\n"
        "evaluation_report:\n"
        "  score_calculation:\n"
        "    final_score: 4.0\n"
        "  rubric_scores:\n"
        "    - name: Correctness\n"
        "      score: 4\n"
        "      weight: 1.0\n"
        "      evidence: \"see \\`foo.py\\`\"\n"
        "```"
    )
    r = _parse_judge_report(content)
    assert r is not None
    assert r["score_calculation"]["final_score"] == 4.0
def test_parse_rubric_valid_flat():
    """A fenced rubric with top-level keys parses to a spec holding one of them."""
    fenced = f"```yaml\n{_rubric_yaml_sample()}\n```"
    parsed = _parse_rubric(fenced)
    assert parsed is not None
    assert any(key in parsed for key in ("checklist", "rubric_dimensions"))
def test_parse_rubric_valid_wrapped():
    """A rubric wrapped in ``evaluation_specification`` yields the inner spec."""
    inner = yaml.safe_load(_rubric_yaml_sample())
    wrapped = yaml.safe_dump({"evaluation_specification": inner})
    parsed = _parse_rubric(f"```yaml\n{wrapped}\n```")
    assert parsed is not None
    assert "rubric_dimensions" in parsed
def test_parse_rubric_rejects_malformed_yaml():
    """Malformed YAML inside the fence makes the parser return ``None``."""
    outcome = _parse_rubric("```yaml\nnot: valid: nested: without: quotes\n```")
    assert outcome is None
def test_parse_rubric_rejects_yaml_without_expected_keys():
    """Well-formed YAML lacking the rubric keys makes the parser return ``None``."""
    outcome = _parse_rubric("```yaml\njust_some: random\n```")
    assert outcome is None
def test_parse_judge_report_valid_with_final_score():
    """A full report parses and exposes its final score."""
    criteria = {"Correctness": 4.0, "Clarity": 3.5}
    parsed = _parse_judge_report(_judge_report(3.7, criteria))
    assert parsed is not None
    assert parsed["score_calculation"]["final_score"] == 3.7
def test_parse_judge_report_valid_without_final_score_but_rubric_scores():
    """A report with rubric scores but no final score is still accepted."""
    rubric_scores = [{"name": "Correctness", "score": 4.0, "weight": 1.0}]
    payload = {"evaluation_report": {"rubric_scores": rubric_scores}}
    fenced = "```yaml\n" + yaml.safe_dump(payload) + "\n```"
    parsed = _parse_judge_report(fenced)
    assert parsed is not None
def test_parse_judge_report_rejects_empty():
    """YAML without any report content makes the parser return ``None``."""
    outcome = _parse_judge_report("```yaml\nnothing: here\n```")
    assert outcome is None
def test_overall_score_prefers_final_score():
    """An explicit final score is returned even when rubric scores are present."""
    report = {
        "score_calculation": {"final_score": 2.8},
        "rubric_scores": [{"name": "x", "score": 5, "weight": 1}],
    }
    overall = _overall_score(report)
    assert overall == 2.8
def test_overall_score_falls_back_to_weighted_average():
    """Without a final score, the result is the weighted mean of the rubric scores."""
    rubric_scores = [
        {"name": "a", "score": 4.0, "weight": 0.6},
        {"name": "b", "score": 2.0, "weight": 0.4},
    ]
    overall = _overall_score({"rubric_scores": rubric_scores})
    assert overall == pytest.approx(3.2)
def test_overall_score_none_when_nothing_to_extract():
    """An empty report yields no overall score."""
    overall = _overall_score({})
    assert overall is None
def test_criterion_scores_extracts_names_and_scores():
    """Rubric scores are flattened into a name-to-score mapping."""
    expected = {"Correctness": 4.0, "Clarity": 3.0}
    report = {
        "rubric_scores": [
            {"name": "Correctness", "score": 4.0},
            {"name": "Clarity", "score": 3.0},
        ]
    }
    assert _criterion_scores(report) == expected
def test_check_consensus_converged_accept():
    """Closely clustered scores around 4 converge on ACCEPT with the mean score."""
    finals = {"Judge-GPT": 4.0, "Judge-Claude": 4.2, "Judge-Gemini": 4.1}
    reports = {
        judge: {
            "score_calculation": {"final_score": final},
            "rubric_scores": [{"name": "C", "score": 4}],
        }
        for judge, final in finals.items()
    }
    converged, verdict, avg = _check_consensus(reports)
    assert converged is True
    assert verdict == "ACCEPT"
    assert avg == pytest.approx((4.0 + 4.2 + 4.1) / 3)
def test_check_consensus_converged_reject_low_score():
    """Identical low scores from every judge converge on REJECT."""
    reports = {}
    for judge in JUDGE_NAMES:
        reports[judge] = {
            "score_calculation": {"final_score": 2.5},
            "rubric_scores": [{"name": "C", "score": 2}],
        }
    converged, verdict, _avg = _check_consensus(reports)
    assert converged is True
    assert verdict == "REJECT"
def test_check_consensus_not_converged_overall_spread():
    """Overall scores of 2, 4 and 3 give no verdict but still report the mean."""
    finals = {"Judge-GPT": 2.0, "Judge-Claude": 4.0, "Judge-Gemini": 3.0}
    reports = {
        judge: {"score_calculation": {"final_score": final}}
        for judge, final in finals.items()
    }
    converged, verdict, avg = _check_consensus(reports)
    assert converged is False
    assert verdict is None
    assert avg == pytest.approx(3.0)
def test_check_consensus_not_converged_criterion_spread():
    """Near-equal overall scores do not converge when one criterion ranges from 2 to 5."""
    per_judge = {
        "Judge-GPT": (3.0, 2),
        "Judge-Claude": (3.1, 5),
        "Judge-Gemini": (3.0, 3),
    }
    reports = {
        judge: {
            "score_calculation": {"final_score": final},
            "rubric_scores": [{"name": "C", "score": criterion}],
        }
        for judge, (final, criterion) in per_judge.items()
    }
    outcome = _check_consensus(reports)
    converged = outcome[0]
    assert converged is False
def test_check_consensus_no_overalls_returns_false():
    """Empty reports from every judge give neither convergence nor an average."""
    empty_reports = dict.fromkeys(JUDGE_NAMES)
    for judge in empty_reports:
        empty_reports[judge] = {}
    converged, _verdict, avg = _check_consensus(empty_reports)
    assert converged is False
    assert avg is None
# ---------------------------------------------------------------------------
# Comment builders
# ---------------------------------------------------------------------------
def test_meta_judge_mention_contains_mention_and_description():
    """The meta-judge comment carries the mention link, the judge name and every description line."""
    description = "Description line 1\nDescription line 2"
    body = _build_meta_judge_mention("agent-meta", "Title", description)
    expected_fragments = (
        "mention://agent/agent-meta",
        META_JUDGE_NAME,
        "Description line 1",
        "Description line 2",
    )
    for fragment in expected_fragments:
        assert fragment in body
def test_judge_mention_contains_all_three_mentions_and_rubric():
    """The judge comment names and mentions every judge and embeds the rubric and commit URL."""
    judge_ids = {"Judge-GPT": "a", "Judge-Claude": "b", "Judge-Gemini": "c"}
    commit_url = "https://example.com/commit/abc"
    rubric = "rubric: yes"
    body = _build_judge_mention_comment(judge_ids, "Title", "Desc", commit_url, rubric)
    for judge_name in JUDGE_NAMES:
        assert judge_name in body
    for agent_id in judge_ids.values():
        assert f"mention://agent/{agent_id}" in body
    assert rubric in body
    assert commit_url in body
def test_debate_round_comment_quotes_all_prior_reports():
    """The debate comment names the round and includes each judge's prior report."""
    judge_ids = {}
    prior_reports = {}
    for judge in JUDGE_NAMES:
        judge_ids[judge] = f"id-{judge}"
        prior_reports[judge] = f"REPORT FROM {judge}"
    body = _build_debate_round_comment(judge_ids, 1, prior_reports)
    assert "Debate round 1" in body
    for judge, report in prior_reports.items():
        assert judge in body
        assert report in body
def test_debate_round_comment_contains_cek_anti_sycophancy_language():
    """Regression guard: CEK's debate instructions must appear word for word.

    An earlier, softer wording omitted the explicit 'only revise if
    compelling / defend original if you still believe them' instructions and
    the first live run converged sycophantically. Removing any of the exact
    clauses checked below should make this test fail.
    """
    judge_ids = {judge: f"id-{judge}" for judge in JUDGE_NAMES}
    prior_reports = {judge: "x" for judge in JUDGE_NAMES}
    body = _build_debate_round_comment(judge_ids, 1, prior_reports)

    required_clauses = (
        # CEK's structural instructions
        "Identify disagreements (where your scores differ by >1 point)",
        "Defend your position with evidence",
        "Challenge the other judge's position with counter-evidence",
        # CEK's CRITICAL anti-sycophancy list
        "Only revise if you find their evidence compelling.",
        "Defend your original scores if you still believe them.",
        # APPEND not REVISE
        "APPENDS to your prior report",
    )
    for clause in required_clauses:
        assert clause in body
    # The old softer phrasing is gone.
    assert "REVISED" not in body
def test_retrigger_comment_has_anchor_and_no_drift_instructions():
    """The retrigger comment mentions the worker, anchors the description and adds the rework text."""
    original_description = "Original desc line."
    body = _build_retrigger_comment(
        "Worker", "agent-worker", original_description, "VERDICT: REJECT", "r1"
    )
    expected_fragments = (
        "mention://agent/agent-worker",
        "ANCHOR",
        original_description,
        REWORK_INSTRUCTIONS,
    )
    for fragment in expected_fragments:
        assert fragment in body
def test_coordinator_note_no_agent_has_no_mention():
    """The no-agent note asks for manual follow-up and contains no mention link."""
    note = _build_coordinator_note_no_agent("r1", "no assignee set")
    assert "Manual follow-up required" in note
    assert "mention://" not in note
# ---------------------------------------------------------------------------
# _find_commit_url
# ---------------------------------------------------------------------------
def test_find_commit_url_picks_latest():
    """When two comments carry commit URLs, the one with the later timestamp wins."""
    older = {"content": "older https://git.example/commit/abc123", "created_at": "2026-01-01T00:00:00Z"}
    newer = {"content": "newer https://git.example/commit/def456", "created_at": "2026-01-02T00:00:00Z"}
    found = _find_commit_url([older, newer])
    assert found == "https://git.example/commit/def456"
def test_find_commit_url_empty_when_absent():
    """Comments without a commit URL produce an empty string."""
    comments = [{"content": "no urls"}]
    found = _find_commit_url(comments)
    assert found == ""
# ---------------------------------------------------------------------------
# _find_reply_by_agent
# ---------------------------------------------------------------------------
def test_find_reply_by_agent_respects_cutoff():
    """Only the agent's comment posted after the cutoff comment is returned."""
    early = {"id": "before", "author_id": "agent-x", "content": "early", "created_at": _past(100)}
    cutoff = {"id": "cutoff", "author_id": "coord", "content": "mention", "created_at": _past(50)}
    late = {"id": "after", "author_id": "agent-x", "content": "late", "created_at": _past(10)}
    thread = [early, cutoff, late]
    reply = _find_reply_by_agent(thread, "agent-x", "cutoff")
    assert reply == ("after", "late")
def test_find_reply_by_agent_none_when_no_match():
    """A thread with no comment from the agent yields ``None``."""
    thread = [{"id": "x", "author_id": "other", "content": "hi", "created_at": _utcnow()}]
    reply = _find_reply_by_agent(thread, "agent-x", "somewhere")
    assert reply is None
# ---------------------------------------------------------------------------
# _start_round
# ---------------------------------------------------------------------------
def test_start_round_posts_meta_judge_mention_and_sets_phase(tmp_path):
    """Starting a pending round posts one meta-judge mention and moves to awaiting_rubric."""
    round_, queue = _make_round(tmp_path, phase="convened", status="pending")
    client = FakeClient()

    _start_round(round_, client, queue, FakeConfig(), _logger)

    assert round_.status == "running"
    posted = client.posted_comments
    assert len(posted) == 1
    assert "mention://agent/agent-meta" in posted[0]
    assert round_.phase == "awaiting_rubric"
    assert round_.meta_judge_comment_id != ""
def test_start_round_marks_error_on_api_failure(tmp_path):
    """A client whose issue lookup raises leaves the round in the error status."""
    round_, queue = _make_round(tmp_path, phase="convened", status="pending")

    class FailingClient(FakeClient):
        """FakeClient whose issue lookup always raises."""

        def get_issue(self, issue_id):
            raise RuntimeError("no issue")

    failing = FailingClient()
    _start_round(round_, failing, queue, FakeConfig(), _logger)
    assert round_.status == "error"
# ---------------------------------------------------------------------------
# _advance_awaiting_rubric
# ---------------------------------------------------------------------------
def test_advance_awaiting_rubric_waits_when_meta_judge_silent(tmp_path):
    """With only the coordinator's mention in the thread, the round stays put."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_utcnow(),
    )
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    client.comments = [mention]

    _advance_awaiting_rubric(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "awaiting_rubric"
    assert round_.status == "running"
def test_advance_awaiting_rubric_errors_on_malformed_yaml(tmp_path):
    """A meta-judge reply with unparseable YAML puts the round into the error phase."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_utcnow(),
    )
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    bad_reply = {"id": "reply", "author_id": "agent-meta", "content": "```yaml\nnot: valid: nested\n```", "created_at": _utcnow()}
    client.comments = [mention, bad_reply]

    _advance_awaiting_rubric(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "error"
    assert round_.status == "error"
def test_advance_awaiting_rubric_moves_to_awaiting_judges_on_valid_rubric(tmp_path):
    """A valid rubric reply stores the rubric, advances the phase and mentions all judges."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_utcnow(),
    )
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    rubric_reply = {"id": "reply", "author_id": "agent-meta", "content": f"```yaml\n{_rubric_yaml_sample()}\n```", "created_at": _utcnow()}
    client.comments = [mention, rubric_reply]

    _advance_awaiting_rubric(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "awaiting_judges"
    assert round_.rubric_yaml != ""
    for agent_id in ("agent-gpt", "agent-claude", "agent-gemini"):
        link = f"mention://agent/{agent_id}"
        assert any(link in posted for posted in client.posted_comments)
def test_advance_awaiting_rubric_errors_on_timeout_without_reply(tmp_path):
    """A round older than the timeout with no meta-judge reply goes to error."""
    cfg = FakeConfig()
    overdue_s = cfg.round_timeout_s + 5
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_past(overdue_s),
    )
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(overdue_s)}
    client.comments = [mention]

    _advance_awaiting_rubric(round_, client, queue, cfg, _logger)

    assert round_.phase == "error"
    assert round_.status == "error"
# ---------------------------------------------------------------------------
# _advance_awaiting_judges
# ---------------------------------------------------------------------------
def test_advance_awaiting_judges_waits_when_missing_reports(tmp_path):
    """With a report from only one judge the round remains in awaiting_judges."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
    )
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    only_report = _reply_comment("agent-gpt", _judge_report(3.5, {"Correctness": 4}))
    client.comments = [mention, only_report]

    _advance_awaiting_judges(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "awaiting_judges"
def test_advance_awaiting_judges_accepts_on_consensus(tmp_path):
    """Three agreeing high scores accept the round and mark the issue done."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
    )
    client = FakeClient()
    thread = [{"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}]
    for agent_id, final in (("agent-gpt", 4.0), ("agent-claude", 4.1), ("agent-gemini", 4.2)):
        thread.append(_reply_comment(agent_id, _judge_report(final, {"Correctness": 4})))
    client.comments = thread

    _advance_awaiting_judges(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "accepted"
    assert round_.status == "done"
    assert client.issue["status"] == "done"
def test_advance_awaiting_judges_rejects_on_consensus_low(tmp_path):
    """Three agreeing low scores reject the round, reopen the issue and mention the worker."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
    )
    client = FakeClient()
    # Assign the issue to an agent that exists in the roster.
    client.issue["assignee_type"] = "agent"
    client.issue["assignee_id"] = "agent-worker"
    client.agents.append({"name": "Worker", "id": "agent-worker"})
    thread = [{"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}]
    for agent_id, final in (("agent-gpt", 2.0), ("agent-claude", 2.2), ("agent-gemini", 2.1)):
        thread.append(_reply_comment(agent_id, _judge_report(final, {"Correctness": 2})))
    client.comments = thread

    _advance_awaiting_judges(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "rejected"
    assert round_.status == "done"
    assert client.issue["status"] == "in_progress"
    worker_link = "mention://agent/agent-worker"
    assert any(worker_link in posted for posted in client.posted_comments)
def test_advance_awaiting_judges_starts_debate_round_when_spread(tmp_path):
    """Scores of 2, 4 and 3 open the first debate round."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
        debate_round=0,
    )
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    gpt_reply = _reply_comment("agent-gpt", _judge_report(2.0, {"Correctness": 2}))
    claude_reply = _reply_comment("agent-claude", _judge_report(4.0, {"Correctness": 4}))
    gemini_reply = _reply_comment("agent-gemini", _judge_report(3.0, {"Correctness": 3}))
    client.comments = [mention, gpt_reply, claude_reply, gemini_reply]

    _advance_awaiting_judges(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "awaiting_debate"
    assert round_.debate_round == 1
def test_advance_awaiting_judges_errors_when_no_parseable(tmp_path):
    """Past the timeout, three unparseable replies put the round into error."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_past(FakeConfig.round_timeout_s + 5),
    )
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(FakeConfig.round_timeout_s + 5)}
    junk_replies = [
        _reply_comment("agent-gpt", "not yaml at all"),
        _reply_comment("agent-claude", "also not yaml"),
        _reply_comment("agent-gemini", "garbage"),
    ]
    client.comments = [mention, *junk_replies]

    _advance_awaiting_judges(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "error"
    assert round_.status == "error"
# ---------------------------------------------------------------------------
# Debate round cap
# ---------------------------------------------------------------------------
def test_awaiting_debate_errors_out_at_cap(tmp_path):
    """Still-divergent scores at the maximum debate round end the round in error."""
    round_, queue = _make_round(
        tmp_path,
        phase="awaiting_debate",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
        debate_round=MAX_DEBATE_ROUNDS,
    )
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    gpt_reply = _reply_comment("agent-gpt", _judge_report(2.0, {"Correctness": 2}))
    claude_reply = _reply_comment("agent-claude", _judge_report(4.0, {"Correctness": 4}))
    gemini_reply = _reply_comment("agent-gemini", _judge_report(3.0, {"Correctness": 3}))
    client.comments = [mention, gpt_reply, claude_reply, gemini_reply]

    _advance_awaiting_debate(round_, client, queue, FakeConfig(), _logger)

    assert round_.phase == "error"
    assert round_.status == "error"
# ---------------------------------------------------------------------------
# Race fix: issue status moves BEFORE round marked done
# ---------------------------------------------------------------------------
def test_apply_verdict_updates_issue_before_marking_round_done(tmp_path):
    """On ACCEPT the issue status call is recorded before the round status call."""
    events: list[str] = []

    class TrackingClient(FakeClient):
        """FakeClient that logs each issue status update."""

        def update_issue_status(self, issue_id, status):
            events.append(f"issue:{status}")
            return super().update_issue_status(issue_id, status)

    class TrackingQueue(DebateQueue):
        """DebateQueue that logs each round status update."""

        def update_status(self, round_id, status):
            events.append(f"round:{status}")
            super().update_status(round_id, status)

    round_ = Round(
        round_id="r1",
        issue_id="issue-1",
        identifier="WYL-X",
        title="t",
        enqueued_at=_utcnow(),
        status="running",
        phase="awaiting_judges",
        phase_entered_at=_utcnow(),
    )
    queue = TrackingQueue.load(tmp_path / "queue.json")
    queue.rounds.append(round_)
    queue.save()
    client = TrackingClient()

    _apply_verdict(round_, client, queue, "ACCEPT", "VERDICT: ACCEPT\nScore 4.0", _logger)

    assert "issue:done" in events
    assert "round:done" in events
    issue_pos = events.index("issue:done")
    round_pos = events.index("round:done")
    assert issue_pos < round_pos
# ---------------------------------------------------------------------------
# _post_rejection_retrigger corner cases
# ---------------------------------------------------------------------------
def test_reject_retrigger_skipped_for_member_assignee(tmp_path):
    """A member assignee gets a single mention-free manual follow-up note."""
    round_, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue["assignee_type"] = "member"
    client.issue["assignee_id"] = "user-1"

    comment_id = _post_rejection_retrigger(round_, client, client.issue, "VERDICT: REJECT", _logger)

    assert comment_id is None
    posted = client.posted_comments
    assert len(posted) == 1
    note = posted[0]
    assert "mention://" not in note
    assert "Manual follow-up" in note
def test_reject_retrigger_skipped_when_no_assignee(tmp_path):
    """With no assignee on the issue, no retrigger comment id is returned."""
    round_, _queue = _make_round(tmp_path)
    client = FakeClient()
    for field in ("assignee_type", "assignee_id"):
        client.issue[field] = None

    comment_id = _post_rejection_retrigger(round_, client, client.issue, "VERDICT: REJECT", _logger)

    assert comment_id is None
def test_reject_retrigger_includes_verbatim_description(tmp_path):
    """Every non-blank description line and the rework text appear in the retrigger comment."""
    round_, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue["assignee_type"] = "agent"
    client.issue["assignee_id"] = "agent-worker"
    client.agents.append({"name": "Worker", "id": "agent-worker"})
    description = "Line A.\n\nLine B with unique-marker-123.\n\nLine C."
    client.issue["description"] = description

    comment_id = _post_rejection_retrigger(round_, client, client.issue, "VERDICT: REJECT", _logger)

    assert comment_id is not None
    body = client.posted_comments[-1]
    non_blank_lines = [line for line in description.splitlines() if line.strip()]
    for line in non_blank_lines:
        assert line in body
    assert REWORK_INSTRUCTIONS in body
# ---------------------------------------------------------------------------
# _advance_round dispatch
# ---------------------------------------------------------------------------
def test_advance_round_dispatches_convened_to_start(tmp_path):
    """Advancing a convened, pending round leaves it in awaiting_rubric."""
    round_, queue = _make_round(tmp_path, phase="convened", status="pending")
    _advance_round(round_, FakeClient(), queue, FakeConfig(), _logger)
    assert round_.phase == "awaiting_rubric"
def test_advance_round_corrects_terminal_phase_with_wrong_status(tmp_path):
    """A round in the accepted phase but still marked running is set to done."""
    round_, queue = _make_round(tmp_path, phase="accepted", status="running")
    _advance_round(round_, FakeClient(), queue, FakeConfig(), _logger)
    assert round_.status == "done"