00ff80fbbb
Live round on WYL-72 exposed that my debate-round comment produced sycophancy, not debate. Judge-Gemini moved from a 4.0 ACCEPT stance to "I agree my initial score was too lenient... I have adjusted my scores to 3s and 2s" after reading Judge-GPT's 2.92 report — without defending its original scores or challenging GPT's evidence. Classic social-pressure convergence.

Root cause: my debate comment said "You may hold your position if you have new evidence; you may move if you find the other reasoning more grounded. Do not split the difference to compromise." That phrasing is both weaker than CEK's intent AND it dropped every structural anti-sycophancy instruction CEK spelled out in judge-with-debate/SKILL.md:

Missing: "Identify disagreements (where your scores differ by >1 point)"
Missing: "Defend your position with evidence from the specification"
Missing: "Challenge the other judge's position with counter-evidence"
Missing: "Only revise if you find their evidence compelling"
Missing: "Defend your original scores if you still believe them"

Also: I asked judges to post a REVISED report (implicitly retracting their prior position). CEK asks them to APPEND a debate round section to their prior report, keeping both visible so the revision is a change ON TOP OF the original rather than a replacement.

Fixed by porting CEK's instruction block verbatim into _build_debate_round_comment. Added a regression test that fails if any future edit removes these exact clauses.

Tests: 72 passed (+1 regression test).
767 lines
28 KiB
Python
767 lines
28 KiB
Python
"""Tests for the CEK-native review pipeline (meta-judge + 3 judges + consensus)."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import pathlib
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from coordinator.queue import DebateQueue, Round
|
|
from coordinator.orchestrator import (
|
|
ACCEPT_MIN_SCORE,
|
|
CONSENSUS_CRITERION_THRESHOLD,
|
|
CONSENSUS_OVERALL_THRESHOLD,
|
|
JUDGE_NAMES,
|
|
MAX_DEBATE_ROUNDS,
|
|
META_JUDGE_NAME,
|
|
REWORK_INSTRUCTIONS,
|
|
_advance_awaiting_debate,
|
|
_advance_awaiting_judges,
|
|
_advance_awaiting_rubric,
|
|
_advance_round,
|
|
_apply_verdict,
|
|
_build_coordinator_note_no_agent,
|
|
_build_debate_round_comment,
|
|
_build_judge_mention_comment,
|
|
_build_meta_judge_mention,
|
|
_build_retrigger_comment,
|
|
_check_consensus,
|
|
_criterion_scores,
|
|
_extract_yaml,
|
|
_find_commit_url,
|
|
_find_reply_by_agent,
|
|
_overall_score,
|
|
_parse_judge_report,
|
|
_parse_rubric,
|
|
_post_rejection_retrigger,
|
|
_start_round,
|
|
_utcnow,
|
|
)
|
|
|
|
# Logger passed as the final argument to the orchestrator functions under test.
_logger = logging.getLogger("test.orchestrator")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fakes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class FakeConfig:
    """Fixed configuration values handed to the orchestrator functions in tests.

    All settings are class attributes, so they can be read from the class
    itself (``FakeConfig.round_timeout_s``) or from an instance.
    """

    # Dummy connection settings.
    server_url: str = "http://x"
    workspace_id: str = "wid"
    token: str = "tok"

    # Timing / concurrency knobs. The timeout tests build timestamps older
    # than ``round_timeout_s`` (presumably seconds, per the ``_s`` suffix).
    poll_interval_s: int = 30
    round_timeout_s: int = 600
    max_concurrent_rounds: int = 3
|
|
|
|
|
|
class FakeClient:
    """In-memory stand-in for the issue-tracker client used by the orchestrator.

    Holds a single issue, a mutable list of comments, and four agents (the
    meta-judge plus three judges). Comments created through
    :meth:`post_comment` are recorded both in ``posted_comments`` (bodies
    only) and in ``comments`` (full comment dicts).
    """

    def __init__(self) -> None:
        # The one issue every test works against; it starts unassigned.
        self.issue: dict[str, Any] = dict(
            id="issue-1",
            title="Do the thing",
            description="Please do the thing clearly.",
            status="in_review",
            assignee_type=None,
            assignee_id=None,
        )
        self.comments: list[dict[str, Any]] = []
        self.posted_comments: list[str] = []
        roster = (
            (META_JUDGE_NAME, "agent-meta"),
            ("Judge-GPT", "agent-gpt"),
            ("Judge-Claude", "agent-claude"),
            ("Judge-Gemini", "agent-gemini"),
        )
        self.agents: list[dict[str, Any]] = [
            {"name": agent_name, "id": agent_id} for agent_name, agent_id in roster
        ]
        # Counter used to mint ids for comments created by post_comment().
        self._next_comment_id = 1000

    def get_issue(self, issue_id: str) -> dict[str, Any]:
        """Return a shallow copy of the stored issue; ``issue_id`` is ignored."""
        return {**self.issue}

    def update_issue_status(self, issue_id: str, status: str) -> dict[str, Any]:
        """Store ``status`` on the issue and echo the id/status pair back."""
        self.issue["status"] = status
        return dict(id=issue_id, status=status)

    def list_comments(self, issue_id: str) -> list[dict[str, Any]]:
        """Return a new list holding the current comments; ``issue_id`` is ignored."""
        return [*self.comments]

    def post_comment(self, issue_id: str, content: str) -> dict[str, Any]:
        """Record ``content`` as a comment by ``coord-user`` and return its id and timestamp."""
        self.posted_comments.append(content)
        comment_id = f"posted-{self._next_comment_id}"
        self._next_comment_id += 1
        stamp = _utcnow()
        record = {
            "id": comment_id,
            "content": content,
            "author_id": "coord-user",
            "created_at": stamp,
        }
        self.comments.append(record)
        return {"id": comment_id, "created_at": stamp}

    def list_agents(self) -> list[dict[str, Any]]:
        """Return a new list holding the registered agents."""
        return [*self.agents]

    def find_agents_by_name(self, names):
        """Map each requested name that matches a registered agent to that agent's id."""
        wanted = set(names)
        matches = {}
        for agent in self.agents:
            if agent["name"] in wanted:
                matches[agent["name"]] = agent["id"]
        return matches

    def get_agent_name(self, agent_id: str) -> str | None:
        """Return the name of the first agent with ``agent_id``, or ``None`` if there is none."""
        return next(
            (agent["name"] for agent in self.agents if agent["id"] == agent_id),
            None,
        )
|
|
|
|
|
|
def _past(seconds_ago: int) -> str:
    """Return the UTC time ``seconds_ago`` seconds before now as ``YYYY-MM-DDTHH:MM:SSZ``."""
    from datetime import timedelta

    offset = timedelta(seconds=seconds_ago)
    return (datetime.now(timezone.utc) - offset).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
def _reply_comment(agent_id: str, content: str, created: str | None = None) -> dict[str, Any]:
    """Build a comment dict authored by ``agent_id``.

    The comment id is ``reply-<agent_id>``. ``created_at`` is ``created`` when
    that is truthy; otherwise it is whatever ``_utcnow()`` returns.
    """
    timestamp = created if created else _utcnow()
    return dict(
        id=f"reply-{agent_id}",
        author_id=agent_id,
        content=content,
        created_at=timestamp,
    )
|
|
|
|
|
|
def _rubric_yaml_sample() -> str:
    """Return YAML text for a small rubric: two checklist items and two weighted dimensions."""
    checklist = [
        dict(
            question="Does the code compile?",
            category="hard_rule",
            importance="essential",
            rationale="basic",
        ),
        dict(
            question="Is documentation present?",
            category="principle",
            importance="important",
            rationale="quality",
        ),
    ]
    rubric_dimensions = [
        dict(
            name="Correctness",
            description="Does the code work",
            scale="1-5",
            weight=0.6,
            score_definitions={1: "no", 5: "perfect"},
        ),
        dict(
            name="Clarity",
            description="Readability",
            scale="1-5",
            weight=0.4,
            score_definitions={1: "opaque", 5: "crystal"},
        ),
    ]
    rubric = {"checklist": checklist, "rubric_dimensions": rubric_dimensions}
    # sort_keys=False keeps the keys in the insertion order written above.
    return yaml.safe_dump(rubric, sort_keys=False)
|
|
|
|
|
|
def _judge_report(final_score: float, criteria: dict[str, float]) -> str:
    """Return a fenced ``yaml`` block containing a minimal judge evaluation report.

    Each ``criteria`` entry becomes one ``rubric_scores`` item with a fixed
    weight of 0.5; ``final_score`` is stored under ``score_calculation`` and
    echoed in the executive summary.
    """
    rubric_scores = []
    for criterion, score in criteria.items():
        rubric_scores.append({"name": criterion, "score": score, "weight": 0.5})

    payload = {
        "evaluation_report": {
            "score_calculation": {"final_score": final_score},
            "rubric_scores": rubric_scores,
            "executive_summary": f"score {final_score}",
        }
    }
    dumped = yaml.safe_dump(payload, sort_keys=False)
    return f"```yaml\n{dumped}\n```"
|
|
|
|
|
|
def _make_round(tmp_path: pathlib.Path, **overrides) -> tuple[Round, DebateQueue]:
    """Create a ``Round`` with default fields, apply ``overrides``, and persist it in a queue.

    The overrides are applied with ``setattr`` after construction. The queue
    is loaded from ``tmp_path / "queue.json"``, the round is appended to it,
    and the queue is saved before both objects are returned.
    """
    defaults = {
        "round_id": "r1",
        "issue_id": "issue-1",
        "identifier": "WYL-X",
        "title": "Do the thing",
        "enqueued_at": _utcnow(),
        "status": "running",
        "phase": "convened",
        "phase_entered_at": _utcnow(),
    }
    new_round = Round(**defaults)
    for attr, value in overrides.items():
        setattr(new_round, attr, value)

    queue = DebateQueue.load(tmp_path / "queue.json")
    queue.rounds.append(new_round)
    queue.save()
    return new_round, queue
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pure helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_extract_yaml_from_fenced_block():
    """The body of a fenced yaml block is returned without the surrounding prose."""
    comment = "Here is the rubric:\n\n```yaml\nchecklist:\n - question: foo\n```\n\nDone."
    extracted = _extract_yaml(comment)
    assert extracted.startswith("checklist:")
    assert "question: foo" in extracted
|
|
|
|
|
|
def test_extract_yaml_from_unfenced_content():
    """Content without a code fence comes back as the stripped input."""
    raw = "checklist:\n - question: foo"
    assert _extract_yaml(raw) == raw.strip()
|
|
|
|
|
|
def test_extract_yaml_unescapes_html_entities():
    """``_extract_yaml`` turns ``&quot;`` entities in the input back into plain quotes."""
    # Multica REST API returns comment content with `"` as `&quot;`, `>` as `&gt;`, etc.
    content = "checklist:\n - id: &quot;CK-001&quot;\n question: &quot;does it work?&quot;"
    y = _extract_yaml(content)
    assert '"CK-001"' in y
    # No encoded quote may survive extraction.
    assert "&quot;" not in y
|
|
|
|
|
|
def test_parse_rubric_accepts_html_encoded_input():
    """``_parse_rubric`` accepts a rubric whose quotes arrive as ``&quot;`` entities."""
    # The item fields are nested under the checklist entry so the YAML
    # describes a single checklist item with four attributes.
    encoded = (
        "rrd_cycle_applied: true\n"
        "evaluation_specification:\n"
        "  checklist:\n"
        "    - id: &quot;CK-001&quot;\n"
        "      question: &quot;does it work?&quot;\n"
        "      category: &quot;hard_rule&quot;\n"
        "      importance: &quot;essential&quot;\n"
    )
    spec = _parse_rubric(encoded)
    assert spec is not None
    assert "checklist" in spec
|
|
|
|
|
|
def test_extract_yaml_repairs_backslash_backtick():
    """Backslash-escaped backticks are replaced by plain backticks during extraction."""
    # Gemini (and similar) emit \` inside double-quoted YAML strings, imitating
    # markdown escaping. \` is not a valid YAML escape, so we repair it.
    raw = (
        "evaluation_report:\n"
        " rubric_scores:\n"
        " - name: X\n"
        " score: 4\n"
        ' evidence: "see \\`foo.py\\` and \\`bar.py\\`"\n'
    )
    repaired = _extract_yaml(raw)
    assert "\\`" not in repaired
    assert "`foo.py`" in repaired
|
|
|
|
|
|
def test_parse_judge_report_tolerates_backslash_backtick():
    """A judge report containing ``\\``` in a quoted string still parses to a usable report."""
    # final_score is nested under score_calculation so the lookup asserted
    # at the end of this test can succeed.
    content = (
        "```yaml\n"
        "evaluation_report:\n"
        "  score_calculation:\n"
        "    final_score: 4.0\n"
        "  rubric_scores:\n"
        "    - name: Correctness\n"
        "      score: 4\n"
        "      weight: 1.0\n"
        "      evidence: \"see \\`foo.py\\`\"\n"
        "```"
    )
    r = _parse_judge_report(content)
    assert r is not None
    assert r["score_calculation"]["final_score"] == 4.0
|
|
|
|
|
|
def test_parse_rubric_valid_flat():
    """A fenced rubric with top-level ``checklist``/``rubric_dimensions`` keys is accepted."""
    fenced = "```yaml\n" + _rubric_yaml_sample() + "\n```"
    parsed = _parse_rubric(fenced)
    assert parsed is not None
    assert any(key in parsed for key in ("checklist", "rubric_dimensions"))
|
|
|
|
|
|
def test_parse_rubric_valid_wrapped():
    """A rubric nested under ``evaluation_specification`` is accepted and exposes its dimensions."""
    inner = yaml.safe_load(_rubric_yaml_sample())
    wrapped_text = yaml.safe_dump({"evaluation_specification": inner})
    parsed = _parse_rubric("```yaml\n" + wrapped_text + "\n```")
    assert parsed is not None
    assert "rubric_dimensions" in parsed
|
|
|
|
|
|
def test_parse_rubric_rejects_malformed_yaml():
    """``_parse_rubric`` returns ``None`` for this malformed fenced YAML."""
    malformed = "```yaml\nnot: valid: nested: without: quotes\n```"
    result = _parse_rubric(malformed)
    assert result is None
|
|
|
|
|
|
def test_parse_rubric_rejects_yaml_without_expected_keys():
    """Well-formed YAML that lacks the rubric keys yields ``None``."""
    unrelated = "```yaml\njust_some: random\n```"
    result = _parse_rubric(unrelated)
    assert result is None
|
|
|
|
|
|
def test_parse_judge_report_valid_with_final_score():
    """A report built by ``_judge_report`` parses and keeps its ``final_score``."""
    fenced = _judge_report(3.7, {"Correctness": 4.0, "Clarity": 3.5})
    parsed = _parse_judge_report(fenced)
    assert parsed is not None
    assert parsed["score_calculation"]["final_score"] == 3.7
|
|
|
|
|
|
def test_parse_judge_report_valid_without_final_score_but_rubric_scores():
    """A report with only ``rubric_scores`` (no ``score_calculation``) is still accepted."""
    rubric_scores = [{"name": "Correctness", "score": 4.0, "weight": 1.0}]
    rubric_only = {"evaluation_report": {"rubric_scores": rubric_scores}}
    fenced = f"```yaml\n{yaml.safe_dump(rubric_only)}\n```"
    assert _parse_judge_report(fenced) is not None
|
|
|
|
|
|
def test_parse_judge_report_rejects_empty():
    """YAML without any report content yields ``None``."""
    no_report = "```yaml\nnothing: here\n```"
    result = _parse_judge_report(no_report)
    assert result is None
|
|
|
|
|
|
def test_overall_score_prefers_final_score():
    """An explicit ``final_score`` is returned even when rubric scores are also present."""
    report = {
        "score_calculation": {"final_score": 2.8},
        "rubric_scores": [{"name": "x", "score": 5, "weight": 1}],
    }
    assert _overall_score(report) == 2.8
|
|
|
|
|
|
def test_overall_score_falls_back_to_weighted_average():
    """Without ``final_score`` the result is the weighted mean of the rubric scores."""
    weighted = [
        {"name": "a", "score": 4.0, "weight": 0.6},
        {"name": "b", "score": 2.0, "weight": 0.4},
    ]
    # 4.0 * 0.6 + 2.0 * 0.4 == 3.2
    assert _overall_score({"rubric_scores": weighted}) == pytest.approx(3.2)
|
|
|
|
|
|
def test_overall_score_none_when_nothing_to_extract():
    """An empty report has no overall score."""
    empty_report = {}
    assert _overall_score(empty_report) is None
|
|
|
|
|
|
def test_criterion_scores_extracts_names_and_scores():
    """``rubric_scores`` entries are flattened into a name-to-score mapping."""
    entries = [
        {"name": "Correctness", "score": 4.0},
        {"name": "Clarity", "score": 3.0},
    ]
    extracted = _criterion_scores({"rubric_scores": entries})
    assert extracted == {"Correctness": 4.0, "Clarity": 3.0}
|
|
|
|
|
|
def test_check_consensus_converged_accept():
    """Three closely agreeing high scores converge on ACCEPT with the mean as average."""
    finals = {"Judge-GPT": 4.0, "Judge-Claude": 4.2, "Judge-Gemini": 4.1}
    reports = {
        judge: {
            "score_calculation": {"final_score": final},
            "rubric_scores": [{"name": "C", "score": 4}],
        }
        for judge, final in finals.items()
    }
    converged, verdict, avg = _check_consensus(reports)
    assert converged is True
    assert verdict == "ACCEPT"
    assert avg == pytest.approx(sum(finals.values()) / len(finals))
|
|
|
|
|
|
def test_check_consensus_converged_reject_low_score():
    """Identical low scores from every judge converge on REJECT."""
    reports = {}
    for judge in JUDGE_NAMES:
        # A fresh dict per judge, so no two judges share a report object.
        reports[judge] = {
            "score_calculation": {"final_score": 2.5},
            "rubric_scores": [{"name": "C", "score": 2}],
        }
    converged, verdict, _ = _check_consensus(reports)
    assert converged is True
    assert verdict == "REJECT"
|
|
|
|
|
|
def test_check_consensus_not_converged_overall_spread():
    """Overall scores of 2.0 / 4.0 / 3.0 do not converge; the average is still reported."""
    finals = (("Judge-GPT", 2.0), ("Judge-Claude", 4.0), ("Judge-Gemini", 3.0))
    reports = {judge: {"score_calculation": {"final_score": final}} for judge, final in finals}
    converged, verdict, avg = _check_consensus(reports)
    assert converged is False
    assert verdict is None
    assert avg == pytest.approx(3.0)
|
|
|
|
|
|
def test_check_consensus_not_converged_criterion_spread():
    """Near-equal overall scores do not converge when one criterion ranges from 2 to 5."""
    rows = (
        ("Judge-GPT", 3.0, 2),
        ("Judge-Claude", 3.1, 5),
        ("Judge-Gemini", 3.0, 3),
    )
    reports = {
        judge: {
            "score_calculation": {"final_score": final},
            "rubric_scores": [{"name": "C", "score": criterion}],
        }
        for judge, final, criterion in rows
    }
    outcome = _check_consensus(reports)
    converged, _, _ = outcome
    assert converged is False
|
|
|
|
|
|
def test_check_consensus_no_overalls_returns_false():
    """Empty reports give no convergence and no average."""
    reports = {}
    for judge in JUDGE_NAMES:
        reports[judge] = {}
    converged, _verdict, avg = _check_consensus(reports)
    assert converged is False
    assert avg is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Comment builders
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_meta_judge_mention_contains_mention_and_description():
    """The meta-judge comment carries the mention link, the agent name and every description line."""
    description = "Description line 1\nDescription line 2"
    body = _build_meta_judge_mention("agent-meta", "Title", description)
    expected_fragments = (
        "mention://agent/agent-meta",
        META_JUDGE_NAME,
        "Description line 1",
        "Description line 2",
    )
    for fragment in expected_fragments:
        assert fragment in body
|
|
|
|
|
|
def test_judge_mention_contains_all_three_mentions_and_rubric():
    """The judge comment names every judge, links every agent id, and embeds rubric and commit URL."""
    commit_url = "https://example.com/commit/abc"
    rubric_text = "rubric: yes"
    judge_ids = {"Judge-GPT": "a", "Judge-Claude": "b", "Judge-Gemini": "c"}
    body = _build_judge_mention_comment(judge_ids, "Title", "Desc", commit_url, rubric_text)
    for judge_name in JUDGE_NAMES:
        assert judge_name in body
    for agent_id in "abc":
        assert f"mention://agent/{agent_id}" in body
    assert rubric_text in body
    assert commit_url in body
|
|
|
|
|
|
def test_debate_round_comment_quotes_all_prior_reports():
    """The debate comment names the round and includes each judge's name and prior report."""
    judge_ids, prior = {}, {}
    for judge in JUDGE_NAMES:
        judge_ids[judge] = f"id-{judge}"
        prior[judge] = f"REPORT FROM {judge}"
    body = _build_debate_round_comment(judge_ids, 1, prior)
    assert "Debate round 1" in body
    for judge, report in prior.items():
        assert judge in body
        assert report in body
|
|
|
|
|
|
def test_debate_round_comment_contains_cek_anti_sycophancy_language():
    """Regression guard: CEK's debate instructions must appear verbatim.

    An earlier, softer wording omitted the explicit 'only revise if
    compelling' and 'defend your original scores' clauses, and the first live
    run converged sycophantically. This test fails if any of the exact
    clauses below is removed from the debate comment.
    """
    judge_ids = {judge: f"id-{judge}" for judge in JUDGE_NAMES}
    prior = dict.fromkeys(JUDGE_NAMES, "x")
    body = _build_debate_round_comment(judge_ids, 1, prior)

    required_clauses = (
        # CEK's structural instructions
        "Identify disagreements (where your scores differ by >1 point)",
        "Defend your position with evidence",
        "Challenge the other judge's position with counter-evidence",
        # CEK's CRITICAL anti-sycophancy list
        "Only revise if you find their evidence compelling.",
        "Defend your original scores if you still believe them.",
        # APPEND not REVISE
        "APPENDS to your prior report",
    )
    for clause in required_clauses:
        assert clause in body
    assert "REVISED" not in body  # the old softer phrasing is gone
|
|
|
|
|
|
def test_retrigger_comment_has_anchor_and_no_drift_instructions():
    """The retrigger comment mentions the worker, anchors the original description and adds the rework text."""
    original_description = "Original desc line."
    body = _build_retrigger_comment(
        "Worker", "agent-worker", original_description, "VERDICT: REJECT", "r1"
    )
    for fragment in (
        "mention://agent/agent-worker",
        "ANCHOR",
        original_description,
        REWORK_INSTRUCTIONS,
    ):
        assert fragment in body
|
|
|
|
|
|
def test_coordinator_note_no_agent_has_no_mention():
    """The no-agent note asks for manual follow-up and contains no mention link."""
    note = _build_coordinator_note_no_agent("r1", "no assignee set")
    assert "Manual follow-up required" in note
    assert "mention://" not in note
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _find_commit_url
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_find_commit_url_picks_latest():
    """When two comments contain commit URLs, the one from the newer comment is returned."""
    older = {
        "content": "older https://git.example/commit/abc123",
        "created_at": "2026-01-01T00:00:00Z",
    }
    newer = {
        "content": "newer https://git.example/commit/def456",
        "created_at": "2026-01-02T00:00:00Z",
    }
    assert _find_commit_url([older, newer]) == "https://git.example/commit/def456"
|
|
|
|
|
|
def test_find_commit_url_empty_when_absent():
    """Comments without a URL produce the empty string."""
    comments = [{"content": "no urls"}]
    assert _find_commit_url(comments) == ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _find_reply_by_agent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_find_reply_by_agent_respects_cutoff():
    """Only the agent's comment that comes after the cutoff comment is returned."""
    rows = (
        ("before", "agent-x", "early", 100),
        ("cutoff", "coord", "mention", 50),
        ("after", "agent-x", "late", 10),
    )
    comments = [
        {"id": cid, "author_id": author, "content": text, "created_at": _past(age)}
        for cid, author, text, age in rows
    ]
    assert _find_reply_by_agent(comments, "agent-x", "cutoff") == ("after", "late")
|
|
|
|
|
|
def test_find_reply_by_agent_none_when_no_match():
    """``None`` is returned when no comment was written by the requested agent."""
    unrelated = {"id": "x", "author_id": "other", "content": "hi", "created_at": _utcnow()}
    result = _find_reply_by_agent([unrelated], "agent-x", "somewhere")
    assert result is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _start_round
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_start_round_posts_meta_judge_mention_and_sets_phase(tmp_path):
    """Starting a round posts exactly one meta-judge mention and enters ``awaiting_rubric``."""
    rnd, queue = _make_round(tmp_path, phase="convened", status="pending")
    client = FakeClient()

    _start_round(rnd, client, queue, FakeConfig(), _logger)

    assert rnd.status == "running"
    posted = client.posted_comments
    assert len(posted) == 1
    assert "mention://agent/agent-meta" in posted[0]
    assert rnd.phase == "awaiting_rubric"
    assert rnd.meta_judge_comment_id != ""
|
|
|
|
|
|
def test_start_round_marks_error_on_api_failure(tmp_path):
    """If ``get_issue`` raises, ``_start_round`` leaves the round with status ``error``."""
    rnd, queue = _make_round(tmp_path, phase="convened", status="pending")

    class FailingClient(FakeClient):
        """Client whose issue lookup always raises."""

        def get_issue(self, issue_id):
            raise RuntimeError("no issue")

    _start_round(rnd, FailingClient(), queue, FakeConfig(), _logger)
    assert rnd.status == "error"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _advance_awaiting_rubric
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_advance_awaiting_rubric_waits_when_meta_judge_silent(tmp_path):
    """With only the coordinator's mention present, the round stays in ``awaiting_rubric``."""
    round_fields = dict(
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_utcnow(),
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    client.comments = [mention]

    _advance_awaiting_rubric(rnd, client, queue, FakeConfig(), _logger)

    assert (rnd.phase, rnd.status) == ("awaiting_rubric", "running")
|
|
|
|
|
|
def test_advance_awaiting_rubric_errors_on_malformed_yaml(tmp_path):
    """A meta-judge reply with unparseable YAML puts the round into the ``error`` state."""
    round_fields = dict(
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_utcnow(),
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    bad_reply = {
        "id": "reply",
        "author_id": "agent-meta",
        "content": "```yaml\nnot: valid: nested\n```",
        "created_at": _utcnow(),
    }
    client.comments = [mention, bad_reply]

    _advance_awaiting_rubric(rnd, client, queue, FakeConfig(), _logger)

    assert (rnd.phase, rnd.status) == ("error", "error")
|
|
|
|
|
|
def test_advance_awaiting_rubric_moves_to_awaiting_judges_on_valid_rubric(tmp_path):
    """A valid rubric reply is stored, all three judges get mentioned, and the phase advances."""
    round_fields = dict(
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_utcnow(),
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    rubric_reply = {
        "id": "reply",
        "author_id": "agent-meta",
        "content": "```yaml\n" + _rubric_yaml_sample() + "\n```",
        "created_at": _utcnow(),
    }
    client.comments = [mention, rubric_reply]

    _advance_awaiting_rubric(rnd, client, queue, FakeConfig(), _logger)

    assert rnd.phase == "awaiting_judges"
    assert rnd.rubric_yaml != ""
    for agent_id in ("agent-gpt", "agent-claude", "agent-gemini"):
        link = f"mention://agent/{agent_id}"
        assert any(link in posted for posted in client.posted_comments)
|
|
|
|
|
|
def test_advance_awaiting_rubric_errors_on_timeout_without_reply(tmp_path):
    """No reply and a phase older than ``round_timeout_s`` puts the round into ``error``."""
    cfg = FakeConfig()
    overdue = cfg.round_timeout_s + 5  # just past the configured timeout
    rnd, queue = _make_round(
        tmp_path,
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_past(overdue),
    )
    client = FakeClient()
    stale_mention = {
        "id": "meta-c",
        "author_id": "coord",
        "content": "mention",
        "created_at": _past(overdue),
    }
    client.comments = [stale_mention]

    _advance_awaiting_rubric(rnd, client, queue, cfg, _logger)

    assert (rnd.phase, rnd.status) == ("error", "error")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _advance_awaiting_judges
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_advance_awaiting_judges_waits_when_missing_reports(tmp_path):
    """With a report from only one judge, the round stays in ``awaiting_judges``."""
    round_fields = dict(
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    only_reply = _reply_comment("agent-gpt", _judge_report(3.5, {"Correctness": 4}))
    client.comments = [mention, only_reply]

    _advance_awaiting_judges(rnd, client, queue, FakeConfig(), _logger)

    assert rnd.phase == "awaiting_judges"
|
|
|
|
|
|
def test_advance_awaiting_judges_accepts_on_consensus(tmp_path):
    """Three agreeing high scores accept the round and move the issue to ``done``."""
    round_fields = dict(
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    finals = (("agent-gpt", 4.0), ("agent-claude", 4.1), ("agent-gemini", 4.2))
    replies = [
        _reply_comment(agent_id, _judge_report(final, {"Correctness": 4}))
        for agent_id, final in finals
    ]
    client.comments = [mention, *replies]

    _advance_awaiting_judges(rnd, client, queue, FakeConfig(), _logger)

    assert (rnd.phase, rnd.status) == ("accepted", "done")
    assert client.issue["status"] == "done"
|
|
|
|
|
|
def test_advance_awaiting_judges_rejects_on_consensus_low(tmp_path):
    """Three agreeing low scores reject the round, reopen the issue and mention the assigned worker."""
    round_fields = dict(
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    # Assign the issue to an agent so the rejection can be routed to it.
    client.issue["assignee_type"] = "agent"
    client.issue["assignee_id"] = "agent-worker"
    client.agents.append({"name": "Worker", "id": "agent-worker"})
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    finals = (("agent-gpt", 2.0), ("agent-claude", 2.2), ("agent-gemini", 2.1))
    replies = [
        _reply_comment(agent_id, _judge_report(final, {"Correctness": 2}))
        for agent_id, final in finals
    ]
    client.comments = [mention, *replies]

    _advance_awaiting_judges(rnd, client, queue, FakeConfig(), _logger)

    assert (rnd.phase, rnd.status) == ("rejected", "done")
    assert client.issue["status"] == "in_progress"
    assert any("mention://agent/agent-worker" in posted for posted in client.posted_comments)
|
|
|
|
|
|
def test_advance_awaiting_judges_starts_debate_round_when_spread(tmp_path):
    """Scores of 2.0 / 4.0 / 3.0 open debate round 1 instead of producing a verdict."""
    round_fields = dict(
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
        debate_round=0,
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    # (agent id, overall score, Correctness score)
    scores = (("agent-gpt", 2.0, 2), ("agent-claude", 4.0, 4), ("agent-gemini", 3.0, 3))
    replies = [
        _reply_comment(agent_id, _judge_report(final, {"Correctness": criterion}))
        for agent_id, final, criterion in scores
    ]
    client.comments = [mention, *replies]

    _advance_awaiting_judges(rnd, client, queue, FakeConfig(), _logger)

    assert rnd.phase == "awaiting_debate"
    assert rnd.debate_round == 1
|
|
|
|
|
|
def test_advance_awaiting_judges_errors_when_no_parseable(tmp_path):
    """After the timeout, three replies with no parseable report put the round into ``error``."""
    overdue = FakeConfig.round_timeout_s + 5  # just past the configured timeout
    rnd, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_past(overdue),
    )
    client = FakeClient()
    mention = {
        "id": "jm-c",
        "author_id": "coord",
        "content": "mention",
        "created_at": _past(overdue),
    }
    junk = (
        ("agent-gpt", "not yaml at all"),
        ("agent-claude", "also not yaml"),
        ("agent-gemini", "garbage"),
    )
    client.comments = [mention, *(_reply_comment(agent_id, text) for agent_id, text in junk)]

    _advance_awaiting_judges(rnd, client, queue, FakeConfig(), _logger)

    assert (rnd.phase, rnd.status) == ("error", "error")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Debate round cap
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_awaiting_debate_errors_out_at_cap(tmp_path):
    """Still-divergent scores at ``MAX_DEBATE_ROUNDS`` end the round in ``error``."""
    round_fields = dict(
        phase="awaiting_debate",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_utcnow(),
        debate_round=MAX_DEBATE_ROUNDS,
    )
    rnd, queue = _make_round(tmp_path, **round_fields)
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    # (agent id, overall score, Correctness score)
    scores = (("agent-gpt", 2.0, 2), ("agent-claude", 4.0, 4), ("agent-gemini", 3.0, 3))
    replies = [
        _reply_comment(agent_id, _judge_report(final, {"Correctness": criterion}))
        for agent_id, final, criterion in scores
    ]
    client.comments = [mention, *replies]

    _advance_awaiting_debate(rnd, client, queue, FakeConfig(), _logger)

    assert (rnd.phase, rnd.status) == ("error", "error")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Race fix: issue status moves BEFORE round marked done
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_apply_verdict_updates_issue_before_marking_round_done(tmp_path):
    """On ACCEPT the issue status is updated before the round is marked ``done``.

    Both the client and the queue are subclassed to log their status updates
    into one shared list, whose order is then checked.
    """
    events: list[str] = []

    class TrackingClient(FakeClient):
        """Client that logs every issue status update."""

        def update_issue_status(self, issue_id, status):
            events.append(f"issue:{status}")
            return super().update_issue_status(issue_id, status)

    class TrackingQueue(DebateQueue):
        """Queue that logs every round status update."""

        def update_status(self, round_id, status):
            events.append(f"round:{status}")
            super().update_status(round_id, status)

    rnd = Round(
        round_id="r1",
        issue_id="issue-1",
        identifier="WYL-X",
        title="t",
        enqueued_at=_utcnow(),
        status="running",
        phase="awaiting_judges",
        phase_entered_at=_utcnow(),
    )
    queue = TrackingQueue.load(tmp_path / "queue.json")
    queue.rounds.append(rnd)
    queue.save()
    client = TrackingClient()

    _apply_verdict(rnd, client, queue, "ACCEPT", "VERDICT: ACCEPT\nScore 4.0", _logger)

    for expected in ("issue:done", "round:done"):
        assert expected in events
    issue_pos = events.index("issue:done")
    round_pos = events.index("round:done")
    assert issue_pos < round_pos
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _post_rejection_retrigger corner cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_reject_retrigger_skipped_for_member_assignee(tmp_path):
    """For a ``member`` assignee no retrigger id is returned; one mention-free manual note is posted."""
    rnd, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue.update(assignee_type="member", assignee_id="user-1")

    comment_id = _post_rejection_retrigger(rnd, client, client.issue, "VERDICT: REJECT", _logger)

    assert comment_id is None
    assert len(client.posted_comments) == 1
    note = client.posted_comments[0]
    assert "mention://" not in note
    assert "Manual follow-up" in note
|
|
|
|
|
|
def test_reject_retrigger_skipped_when_no_assignee(tmp_path):
    """With no assignee set, no retrigger comment id is returned."""
    rnd, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue.update(assignee_type=None, assignee_id=None)

    comment_id = _post_rejection_retrigger(rnd, client, client.issue, "VERDICT: REJECT", _logger)

    assert comment_id is None
|
|
|
|
|
|
def test_reject_retrigger_includes_verbatim_description(tmp_path):
    """The retrigger comment contains every non-blank description line and the rework instructions."""
    rnd, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue["assignee_type"] = "agent"
    client.issue["assignee_id"] = "agent-worker"
    client.agents.append({"name": "Worker", "id": "agent-worker"})
    description = "Line A.\n\nLine B with unique-marker-123.\n\nLine C."
    client.issue["description"] = description

    comment_id = _post_rejection_retrigger(rnd, client, client.issue, "VERDICT: REJECT", _logger)

    assert comment_id is not None
    body = client.posted_comments[-1]
    non_blank_lines = [line for line in description.splitlines() if line.strip()]
    for line in non_blank_lines:
        assert line in body
    assert REWORK_INSTRUCTIONS in body
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _advance_round dispatch
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_advance_round_dispatches_convened_to_start(tmp_path):
    """A pending round in phase ``convened`` ends up in ``awaiting_rubric`` after one advance."""
    rnd, queue = _make_round(tmp_path, phase="convened", status="pending")
    _advance_round(rnd, FakeClient(), queue, FakeConfig(), _logger)
    assert rnd.phase == "awaiting_rubric"
|
|
|
|
|
|
def test_advance_round_corrects_terminal_phase_with_wrong_status(tmp_path):
    """A round in phase ``accepted`` that is still ``running`` gets its status set to ``done``."""
    rnd, queue = _make_round(tmp_path, phase="accepted", status="running")
    _advance_round(rnd, FakeClient(), queue, FakeConfig(), _logger)
    assert rnd.status == "done"
|