f88255096e
The prior pipeline (4 hand-written debater prompts + 1 judge with my prompt
template) kept missing scope drift because every prompt was mine and the
reviewers were all on the same model tier with correlated priors.
This commit replaces the whole review step with CEK's judge-with-debate
pattern translated to multica-native execution:
pending → awaiting_rubric (meta-judge writes YAML spec from issue alone)
→ awaiting_judges (3 judges on 3 copilot models score independently)
→ consensus check (overall within 0.5, criteria within 1.0)
→ accept or reject, OR up to 3 awaiting_debate rounds
→ error on malformed YAML or cap hit
Per higher-management direction, we do not accommodate a model that cannot
produce YAML: a malformed rubric or all-unparseable judge reports fail the
round immediately (no retries, no fallback to hand-written prompts).
The anchor retrigger on REJECT (WYL-51 behaviour) is preserved verbatim.
Agent prompts for meta-judge and the 3 judges come from the CEK agents
themselves (Meta-Judge / Judge-GPT / Judge-Claude / Judge-Gemini) whose
`instructions` field is the CEK meta-judge.md / judge.md files uploaded
byte-for-byte. No prompts are authored in this coordinator's source.
Adds pyyaml dependency.
- src/coordinator/orchestrator.py: rewritten for the new phase machine
- src/coordinator/queue.py: Round extended with rubric_yaml, judge_report_comment_ids, debate_round
- tests/test_orchestrator.py: 40 tests for new pipeline (helpers, parsers, consensus math, phase handlers, race fix, retrigger)
- tests/test_integration.py: removed (tested old debater pipeline)
- pyproject.toml: adds pyyaml
Tests: 67 passed in 0.20s (40 orchestrator + 15 queue + 7 watcher + 5 other).
695 lines
26 KiB
Python
695 lines
26 KiB
Python
"""Tests for the CEK-native review pipeline (meta-judge + 3 judges + consensus)."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import pathlib
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from coordinator.queue import DebateQueue, Round
|
|
from coordinator.orchestrator import (
|
|
ACCEPT_MIN_SCORE,
|
|
CONSENSUS_CRITERION_THRESHOLD,
|
|
CONSENSUS_OVERALL_THRESHOLD,
|
|
JUDGE_NAMES,
|
|
MAX_DEBATE_ROUNDS,
|
|
META_JUDGE_NAME,
|
|
REWORK_INSTRUCTIONS,
|
|
_advance_awaiting_debate,
|
|
_advance_awaiting_judges,
|
|
_advance_awaiting_rubric,
|
|
_advance_round,
|
|
_apply_verdict,
|
|
_build_coordinator_note_no_agent,
|
|
_build_debate_round_comment,
|
|
_build_judge_mention_comment,
|
|
_build_meta_judge_mention,
|
|
_build_retrigger_comment,
|
|
_check_consensus,
|
|
_criterion_scores,
|
|
_extract_yaml,
|
|
_find_commit_url,
|
|
_find_reply_by_agent,
|
|
_overall_score,
|
|
_parse_judge_report,
|
|
_parse_rubric,
|
|
_post_rejection_retrigger,
|
|
_start_round,
|
|
_utcnow,
|
|
)
|
|
|
|
# Logger instance passed as the final argument to the orchestrator functions under test.
_logger = logging.getLogger("test.orchestrator")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fakes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class FakeConfig:
    """Fixed configuration object handed to the orchestrator in these tests.

    Every setting is a plain class attribute, so it can be read from the
    class itself (``FakeConfig.round_timeout_s``) or from an instance.
    """

    server_url: str = "http://x"
    workspace_id: str = "wid"
    token: str = "tok"

    # The tests backdate rounds by ``round_timeout_s + 5`` seconds to
    # simulate an expired phase.
    poll_interval_s: int = 30
    round_timeout_s: int = 600
    max_concurrent_rounds: int = 3
|
|
|
|
|
|
class FakeClient:
|
|
def __init__(self) -> None:
|
|
self.issue: dict[str, Any] = {
|
|
"id": "issue-1",
|
|
"title": "Do the thing",
|
|
"description": "Please do the thing clearly.",
|
|
"status": "in_review",
|
|
"assignee_type": None,
|
|
"assignee_id": None,
|
|
}
|
|
self.comments: list[dict[str, Any]] = []
|
|
self.posted_comments: list[str] = []
|
|
self.agents: list[dict[str, Any]] = [
|
|
{"name": META_JUDGE_NAME, "id": "agent-meta"},
|
|
{"name": "Judge-GPT", "id": "agent-gpt"},
|
|
{"name": "Judge-Claude", "id": "agent-claude"},
|
|
{"name": "Judge-Gemini", "id": "agent-gemini"},
|
|
]
|
|
self._next_comment_id = 1000
|
|
|
|
def get_issue(self, issue_id: str) -> dict[str, Any]:
|
|
return dict(self.issue)
|
|
|
|
def update_issue_status(self, issue_id: str, status: str) -> dict[str, Any]:
|
|
self.issue["status"] = status
|
|
return {"id": issue_id, "status": status}
|
|
|
|
def list_comments(self, issue_id: str) -> list[dict[str, Any]]:
|
|
return list(self.comments)
|
|
|
|
def post_comment(self, issue_id: str, content: str) -> dict[str, Any]:
|
|
self.posted_comments.append(content)
|
|
cid = f"posted-{self._next_comment_id}"
|
|
self._next_comment_id += 1
|
|
created = _utcnow()
|
|
self.comments.append({
|
|
"id": cid,
|
|
"content": content,
|
|
"author_id": "coord-user",
|
|
"created_at": created,
|
|
})
|
|
return {"id": cid, "created_at": created}
|
|
|
|
def list_agents(self) -> list[dict[str, Any]]:
|
|
return list(self.agents)
|
|
|
|
def find_agents_by_name(self, names):
|
|
want = set(names)
|
|
return {a["name"]: a["id"] for a in self.agents if a["name"] in want}
|
|
|
|
def get_agent_name(self, agent_id: str) -> str | None:
|
|
for a in self.agents:
|
|
if a["id"] == agent_id:
|
|
return a["name"]
|
|
return None
|
|
|
|
|
|
def _past(seconds_ago: int) -> str:
    """Return the UTC time ``seconds_ago`` seconds before now.

    The result is formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    from datetime import timedelta

    offset = timedelta(seconds=seconds_ago)
    moment = datetime.now(timezone.utc) - offset
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
def _reply_comment(agent_id: str, content: str, created: str | None = None) -> dict[str, Any]:
|
|
return {
|
|
"id": f"reply-{agent_id}",
|
|
"author_id": agent_id,
|
|
"content": content,
|
|
"created_at": created or _utcnow(),
|
|
}
|
|
|
|
|
|
def _rubric_yaml_sample() -> str:
    """Return a YAML rubric with two checklist items and two dimensions.

    Keys are dumped in insertion order (``sort_keys=False``).
    """
    checklist = [
        {
            "question": "Does the code compile?",
            "category": "hard_rule",
            "importance": "essential",
            "rationale": "basic",
        },
        {
            "question": "Is documentation present?",
            "category": "principle",
            "importance": "important",
            "rationale": "quality",
        },
    ]
    dimensions = [
        {
            "name": "Correctness",
            "description": "Does the code work",
            "scale": "1-5",
            "weight": 0.6,
            "score_definitions": {1: "no", 5: "perfect"},
        },
        {
            "name": "Clarity",
            "description": "Readability",
            "scale": "1-5",
            "weight": 0.4,
            "score_definitions": {1: "opaque", 5: "crystal"},
        },
    ]
    document = {"checklist": checklist, "rubric_dimensions": dimensions}
    return yaml.safe_dump(document, sort_keys=False)
|
|
|
|
|
|
def _judge_report(final_score: float, criteria: dict[str, float]) -> str:
    """Render a judge report as a fenced YAML block.

    Each entry of ``criteria`` becomes one ``rubric_scores`` item with a
    fixed weight of 0.5. ``final_score`` is stored under
    ``score_calculation`` and echoed in the executive summary.
    """
    rubric_scores = []
    for criterion, score in criteria.items():
        rubric_scores.append({"name": criterion, "score": score, "weight": 0.5})

    payload = {
        "evaluation_report": {
            "score_calculation": {"final_score": final_score},
            "rubric_scores": rubric_scores,
            "executive_summary": f"score {final_score}",
        }
    }
    dumped = yaml.safe_dump(payload, sort_keys=False)
    return "".join(["```yaml\n", dumped, "\n```"])
|
|
|
|
|
|
def _make_round(tmp_path: pathlib.Path, **overrides) -> tuple[Round, DebateQueue]:
    """Create a running/convened round and persist it in a fresh queue file.

    ``overrides`` are applied with ``setattr`` after construction. The queue
    is loaded from ``tmp_path / "queue.json"``, gets the round appended and
    is saved before both objects are returned.
    """
    defaults = {
        "round_id": "r1",
        "issue_id": "issue-1",
        "identifier": "WYL-X",
        "title": "Do the thing",
        "enqueued_at": _utcnow(),
        "status": "running",
        "phase": "convened",
        "phase_entered_at": _utcnow(),
    }
    new_round = Round(**defaults)
    for attribute, value in overrides.items():
        setattr(new_round, attribute, value)

    queue = DebateQueue.load(tmp_path / "queue.json")
    queue.rounds.append(new_round)
    queue.save()
    return new_round, queue
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pure helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_extract_yaml_from_fenced_block():
    """A fenced yaml block is pulled out of the surrounding prose."""
    comment_body = "Here is the rubric:\n\n```yaml\nchecklist:\n - question: foo\n```\n\nDone."
    extracted = _extract_yaml(comment_body)
    assert extracted.startswith("checklist:")
    assert "question: foo" in extracted
|
|
|
|
|
|
def test_extract_yaml_from_unfenced_content():
    """Content without a fence comes back as the stripped input."""
    raw = "checklist:\n - question: foo"
    assert _extract_yaml(raw) == raw.strip()
|
|
|
|
|
|
def test_parse_rubric_valid_flat():
    """A rubric with top-level keys parses and exposes an expected key."""
    fenced = "".join(["```yaml\n", _rubric_yaml_sample(), "\n```"])
    parsed = _parse_rubric(fenced)
    assert parsed is not None
    assert any(key in parsed for key in ("checklist", "rubric_dimensions"))
|
|
|
|
|
|
def test_parse_rubric_valid_wrapped():
    """A rubric nested under ``evaluation_specification`` is unwrapped."""
    inner = yaml.safe_load(_rubric_yaml_sample())
    wrapped_yaml = yaml.safe_dump({"evaluation_specification": inner})
    parsed = _parse_rubric("```yaml\n" + wrapped_yaml + "\n```")
    assert parsed is not None
    assert "rubric_dimensions" in parsed
|
|
|
|
|
|
def test_parse_rubric_rejects_malformed_yaml():
    """Syntactically invalid YAML yields ``None``."""
    broken = "```yaml\nnot: valid: nested: without: quotes\n```"
    outcome = _parse_rubric(broken)
    assert outcome is None
|
|
|
|
|
|
def test_parse_rubric_rejects_yaml_without_expected_keys():
    """Valid YAML lacking the rubric keys yields ``None``."""
    unrelated = "```yaml\njust_some: random\n```"
    outcome = _parse_rubric(unrelated)
    assert outcome is None
|
|
|
|
|
|
def test_parse_judge_report_valid_with_final_score():
    """A full report parses and keeps its ``final_score``."""
    criteria = {"Correctness": 4.0, "Clarity": 3.5}
    parsed = _parse_judge_report(_judge_report(3.7, criteria))
    assert parsed is not None
    assert parsed["score_calculation"]["final_score"] == 3.7
|
|
|
|
|
|
def test_parse_judge_report_valid_without_final_score_but_rubric_scores():
    """A report carrying only ``rubric_scores`` is still accepted."""
    only_scores = [{"name": "Correctness", "score": 4.0, "weight": 1.0}]
    payload = {"evaluation_report": {"rubric_scores": only_scores}}
    fenced = "```yaml\n" + yaml.safe_dump(payload) + "\n```"
    parsed = _parse_judge_report(fenced)
    assert parsed is not None
|
|
|
|
|
|
def test_parse_judge_report_rejects_empty():
    """YAML without any report content yields ``None``."""
    empty_report = "```yaml\nnothing: here\n```"
    outcome = _parse_judge_report(empty_report)
    assert outcome is None
|
|
|
|
|
|
def test_overall_score_prefers_final_score():
    """``final_score`` wins even when rubric scores are also present."""
    report = {
        "score_calculation": {"final_score": 2.8},
        "rubric_scores": [{"name": "x", "score": 5, "weight": 1}],
    }
    overall = _overall_score(report)
    assert overall == 2.8
|
|
|
|
|
|
def test_overall_score_falls_back_to_weighted_average():
    """Without ``final_score`` the result is 4.0*0.6 + 2.0*0.4 = 3.2."""
    weighted = [("a", 4.0, 0.6), ("b", 2.0, 0.4)]
    report = {
        "rubric_scores": [
            {"name": name, "score": score, "weight": weight}
            for name, score, weight in weighted
        ]
    }
    assert _overall_score(report) == pytest.approx(3.2)
|
|
|
|
|
|
def test_overall_score_none_when_nothing_to_extract():
    """An empty report has no overall score."""
    empty_report = {}
    assert _overall_score(empty_report) is None
|
|
|
|
|
|
def test_criterion_scores_extracts_names_and_scores():
    """Rubric entries are flattened into a ``name -> score`` mapping."""
    expected = {"Correctness": 4.0, "Clarity": 3.0}
    report = {
        "rubric_scores": [
            {"name": name, "score": score} for name, score in expected.items()
        ]
    }
    assert _criterion_scores(report) == expected
|
|
|
|
|
|
def test_check_consensus_converged_accept():
    """Tightly clustered high scores converge on ACCEPT with the mean score."""
    finals = {"Judge-GPT": 4.0, "Judge-Claude": 4.2, "Judge-Gemini": 4.1}
    reports = {
        judge: {
            "score_calculation": {"final_score": final},
            "rubric_scores": [{"name": "C", "score": 4}],
        }
        for judge, final in finals.items()
    }
    converged, verdict, avg = _check_consensus(reports)
    assert converged is True
    assert verdict == "ACCEPT"
    assert avg == pytest.approx((4.0 + 4.2 + 4.1) / 3)
|
|
|
|
|
|
def test_check_consensus_converged_reject_low_score():
    """Identical low scores from every judge converge on REJECT."""
    reports = {}
    for judge in JUDGE_NAMES:
        reports[judge] = {
            "score_calculation": {"final_score": 2.5},
            "rubric_scores": [{"name": "C", "score": 2}],
        }
    converged, verdict, _avg = _check_consensus(reports)
    assert converged is True
    assert verdict == "REJECT"
|
|
|
|
|
|
def test_check_consensus_not_converged_overall_spread():
    """Overall scores of 2.0/4.0/3.0 do not converge; the mean is still reported."""
    finals = {"Judge-GPT": 2.0, "Judge-Claude": 4.0, "Judge-Gemini": 3.0}
    reports = {
        judge: {"score_calculation": {"final_score": final}}
        for judge, final in finals.items()
    }
    converged, verdict, avg = _check_consensus(reports)
    assert converged is False
    assert verdict is None
    assert avg == pytest.approx(3.0)
|
|
|
|
|
|
def test_check_consensus_not_converged_criterion_spread():
    """Close overall scores still fail when criterion ``C`` ranges from 2 to 5."""
    per_judge = [
        ("Judge-GPT", 3.0, 2),
        ("Judge-Claude", 3.1, 5),
        ("Judge-Gemini", 3.0, 3),
    ]
    reports = {}
    for judge, final, c_score in per_judge:
        reports[judge] = {
            "score_calculation": {"final_score": final},
            "rubric_scores": [{"name": "C", "score": c_score}],
        }
    outcome = _check_consensus(reports)
    converged = outcome[0]
    assert len(outcome) == 3
    assert converged is False
|
|
|
|
|
|
def test_check_consensus_no_overalls_returns_false():
    """Empty reports give no consensus and no average."""
    reports = {}
    for judge in JUDGE_NAMES:
        reports[judge] = {}
    converged, _verdict, avg = _check_consensus(reports)
    assert converged is False
    assert avg is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Comment builders
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_meta_judge_mention_contains_mention_and_description():
    """The meta-judge comment carries the mention link, name and full description."""
    description = "Description line 1\nDescription line 2"
    body = _build_meta_judge_mention("agent-meta", "Title", description)
    expected_fragments = (
        "mention://agent/agent-meta",
        META_JUDGE_NAME,
        "Description line 1",
        "Description line 2",
    )
    for fragment in expected_fragments:
        assert fragment in body
|
|
|
|
|
|
def test_judge_mention_contains_all_three_mentions_and_rubric():
    """The judge comment names and mentions every judge and embeds rubric and commit URL."""
    commit_url = "https://example.com/commit/abc"
    judge_ids = {"Judge-GPT": "a", "Judge-Claude": "b", "Judge-Gemini": "c"}
    body = _build_judge_mention_comment(judge_ids, "Title", "Desc", commit_url, "rubric: yes")
    assert all(judge in body for judge in JUDGE_NAMES)
    assert all(f"mention://agent/{agent_id}" in body for agent_id in ("a", "b", "c"))
    assert "rubric: yes" in body
    assert "https://example.com/commit/abc" in body
|
|
|
|
|
|
def test_debate_round_comment_quotes_all_prior_reports():
    """The debate comment states the round number and quotes each judge's prior report."""
    judge_ids = {}
    prior = {}
    for judge in JUDGE_NAMES:
        judge_ids[judge] = f"id-{judge}"
        prior[judge] = f"REPORT FROM {judge}"

    body = _build_debate_round_comment(judge_ids, 1, prior)

    assert "Debate round 1" in body
    for judge in JUDGE_NAMES:
        assert judge in body
        assert f"REPORT FROM {judge}" in body
|
|
|
|
|
|
def test_retrigger_comment_has_anchor_and_no_drift_instructions():
    """The retrigger comment mentions the worker and includes anchor, description and rework text."""
    body = _build_retrigger_comment(
        "Worker",
        "agent-worker",
        "Original desc line.",
        "VERDICT: REJECT",
        "r1",
    )
    required = (
        "mention://agent/agent-worker",
        "ANCHOR",
        "Original desc line.",
        REWORK_INSTRUCTIONS,
    )
    for fragment in required:
        assert fragment in body
|
|
|
|
|
|
def test_coordinator_note_no_agent_has_no_mention():
    """The no-agent note asks for manual follow-up and contains no mention link."""
    note = _build_coordinator_note_no_agent("r1", "no assignee set")
    assert "mention://" not in note
    assert "Manual follow-up required" in note
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _find_commit_url
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_find_commit_url_picks_latest():
    """With two commit URLs, the one from the newer comment is returned."""
    entries = (
        ("older https://git.example/commit/abc123", "2026-01-01T00:00:00Z"),
        ("newer https://git.example/commit/def456", "2026-01-02T00:00:00Z"),
    )
    comments = [
        {"content": content, "created_at": created_at}
        for content, created_at in entries
    ]
    found = _find_commit_url(comments)
    assert found == "https://git.example/commit/def456"
|
|
|
|
|
|
def test_find_commit_url_empty_when_absent():
    """Comments without any URL give an empty string."""
    comments = [{"content": "no urls"}]
    found = _find_commit_url(comments)
    assert found == ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _find_reply_by_agent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_find_reply_by_agent_respects_cutoff():
    """Only the agent's comment after the cutoff comment is returned."""
    before_cutoff = {"id": "before", "author_id": "agent-x", "content": "early", "created_at": _past(100)}
    cutoff = {"id": "cutoff", "author_id": "coord", "content": "mention", "created_at": _past(50)}
    after_cutoff = {"id": "after", "author_id": "agent-x", "content": "late", "created_at": _past(10)}
    comments = [before_cutoff, cutoff, after_cutoff]

    reply = _find_reply_by_agent(comments, "agent-x", "cutoff")

    assert reply == ("after", "late")
|
|
|
|
|
|
def test_find_reply_by_agent_none_when_no_match():
    """No comment by the requested agent means no reply."""
    unrelated = {"id": "x", "author_id": "other", "content": "hi", "created_at": _utcnow()}
    reply = _find_reply_by_agent([unrelated], "agent-x", "somewhere")
    assert reply is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _start_round
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_start_round_posts_meta_judge_mention_and_sets_phase(tmp_path):
    """Starting a pending round posts one meta-judge mention and enters awaiting_rubric."""
    pending_round, queue = _make_round(tmp_path, phase="convened", status="pending")
    client = FakeClient()

    _start_round(pending_round, client, queue, FakeConfig(), _logger)

    assert pending_round.status == "running"
    assert len(client.posted_comments) == 1
    mention_body = client.posted_comments[0]
    assert "mention://agent/agent-meta" in mention_body
    assert pending_round.phase == "awaiting_rubric"
    assert pending_round.meta_judge_comment_id != ""
|
|
|
|
|
|
def test_start_round_marks_error_on_api_failure(tmp_path):
    """A client whose ``get_issue`` raises leaves the round in ``error`` status."""

    class FailingClient(FakeClient):
        """Client whose issue lookup always fails."""

        def get_issue(self, issue_id):
            raise RuntimeError("no issue")

    pending_round, queue = _make_round(tmp_path, phase="convened", status="pending")
    broken_client = FailingClient()

    _start_round(pending_round, broken_client, queue, FakeConfig(), _logger)

    assert pending_round.status == "error"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _advance_awaiting_rubric
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_advance_awaiting_rubric_waits_when_meta_judge_silent(tmp_path):
    """With only the coordinator's mention present, the round stays put."""
    round_kwargs = {
        "phase": "awaiting_rubric",
        "status": "running",
        "meta_judge_comment_id": "meta-c",
        "phase_entered_at": _utcnow(),
    }
    waiting_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    client.comments = [mention]

    _advance_awaiting_rubric(waiting_round, client, queue, FakeConfig(), _logger)

    assert waiting_round.phase == "awaiting_rubric"
    assert waiting_round.status == "running"
|
|
|
|
|
|
def test_advance_awaiting_rubric_errors_on_malformed_yaml(tmp_path):
    """A meta-judge reply with invalid YAML puts the round into ``error``."""
    round_kwargs = {
        "phase": "awaiting_rubric",
        "status": "running",
        "meta_judge_comment_id": "meta-c",
        "phase_entered_at": _utcnow(),
    }
    waiting_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    bad_reply = {
        "id": "reply",
        "author_id": "agent-meta",
        "content": "```yaml\nnot: valid: nested\n```",
        "created_at": _utcnow(),
    }
    client.comments = [mention, bad_reply]

    _advance_awaiting_rubric(waiting_round, client, queue, FakeConfig(), _logger)

    assert waiting_round.phase == "error"
    assert waiting_round.status == "error"
|
|
|
|
|
|
def test_advance_awaiting_rubric_moves_to_awaiting_judges_on_valid_rubric(tmp_path):
    """A valid rubric is stored, all three judges are mentioned, and the phase advances."""
    round_kwargs = {
        "phase": "awaiting_rubric",
        "status": "running",
        "meta_judge_comment_id": "meta-c",
        "phase_entered_at": _utcnow(),
    }
    waiting_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    mention = {"id": "meta-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    rubric_reply = {
        "id": "reply",
        "author_id": "agent-meta",
        "content": "".join(["```yaml\n", _rubric_yaml_sample(), "\n```"]),
        "created_at": _utcnow(),
    }
    client.comments = [mention, rubric_reply]

    _advance_awaiting_rubric(waiting_round, client, queue, FakeConfig(), _logger)

    assert waiting_round.phase == "awaiting_judges"
    assert waiting_round.rubric_yaml != ""
    judge_mentions = (
        "mention://agent/agent-gpt",
        "mention://agent/agent-claude",
        "mention://agent/agent-gemini",
    )
    for judge_mention in judge_mentions:
        assert any(judge_mention in posted for posted in client.posted_comments)
|
|
|
|
|
|
def test_advance_awaiting_rubric_errors_on_timeout_without_reply(tmp_path):
    """A round backdated past ``round_timeout_s`` with no reply ends in ``error``."""
    cfg = FakeConfig()
    overdue_seconds = cfg.round_timeout_s + 5
    stale_round, queue = _make_round(
        tmp_path,
        phase="awaiting_rubric",
        status="running",
        meta_judge_comment_id="meta-c",
        phase_entered_at=_past(overdue_seconds),
    )
    client = FakeClient()
    mention = {
        "id": "meta-c",
        "author_id": "coord",
        "content": "mention",
        "created_at": _past(overdue_seconds),
    }
    client.comments = [mention]

    _advance_awaiting_rubric(stale_round, client, queue, cfg, _logger)

    assert stale_round.phase == "error"
    assert stale_round.status == "error"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _advance_awaiting_judges
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_advance_awaiting_judges_waits_when_missing_reports(tmp_path):
    """With only one judge report posted, the round stays in awaiting_judges."""
    round_kwargs = {
        "phase": "awaiting_judges",
        "status": "running",
        "judge_mention_comment_id": "jm-c",
        "phase_entered_at": _utcnow(),
    }
    waiting_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    mention = {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}
    lone_report = _reply_comment("agent-gpt", _judge_report(3.5, {"Correctness": 4}))
    client.comments = [mention, lone_report]

    _advance_awaiting_judges(waiting_round, client, queue, FakeConfig(), _logger)

    assert waiting_round.phase == "awaiting_judges"
|
|
|
|
|
|
def test_advance_awaiting_judges_accepts_on_consensus(tmp_path):
    """Three agreeing high scores accept the round and move the issue to done."""
    round_kwargs = {
        "phase": "awaiting_judges",
        "status": "running",
        "judge_mention_comment_id": "jm-c",
        "phase_entered_at": _utcnow(),
    }
    judged_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    comments = [{"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}]
    for agent_id, final in (("agent-gpt", 4.0), ("agent-claude", 4.1), ("agent-gemini", 4.2)):
        comments.append(_reply_comment(agent_id, _judge_report(final, {"Correctness": 4})))
    client.comments = comments

    _advance_awaiting_judges(judged_round, client, queue, FakeConfig(), _logger)

    assert judged_round.phase == "accepted"
    assert judged_round.status == "done"
    assert client.issue["status"] == "done"
|
|
|
|
|
|
def test_advance_awaiting_judges_rejects_on_consensus_low(tmp_path):
    """Three agreeing low scores reject the round.

    The issue goes back to ``in_progress`` and the assigned worker agent is
    mentioned in a posted comment.
    """
    round_kwargs = {
        "phase": "awaiting_judges",
        "status": "running",
        "judge_mention_comment_id": "jm-c",
        "phase_entered_at": _utcnow(),
    }
    judged_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    client.issue["assignee_type"] = "agent"
    client.issue["assignee_id"] = "agent-worker"
    client.agents.append({"name": "Worker", "id": "agent-worker"})
    comments = [{"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}]
    for agent_id, final in (("agent-gpt", 2.0), ("agent-claude", 2.2), ("agent-gemini", 2.1)):
        comments.append(_reply_comment(agent_id, _judge_report(final, {"Correctness": 2})))
    client.comments = comments

    _advance_awaiting_judges(judged_round, client, queue, FakeConfig(), _logger)

    assert judged_round.phase == "rejected"
    assert judged_round.status == "done"
    assert client.issue["status"] == "in_progress"
    assert any("mention://agent/agent-worker" in posted for posted in client.posted_comments)
|
|
|
|
|
|
def test_advance_awaiting_judges_starts_debate_round_when_spread(tmp_path):
    """Scores of 2.0/4.0/3.0 open debate round 1 instead of producing a verdict."""
    round_kwargs = {
        "phase": "awaiting_judges",
        "status": "running",
        "judge_mention_comment_id": "jm-c",
        "phase_entered_at": _utcnow(),
        "debate_round": 0,
    }
    split_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    comments = [{"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}]
    disagreeing = (("agent-gpt", 2.0, 2), ("agent-claude", 4.0, 4), ("agent-gemini", 3.0, 3))
    for agent_id, final, correctness in disagreeing:
        comments.append(_reply_comment(agent_id, _judge_report(final, {"Correctness": correctness})))
    client.comments = comments

    _advance_awaiting_judges(split_round, client, queue, FakeConfig(), _logger)

    assert split_round.phase == "awaiting_debate"
    assert split_round.debate_round == 1
|
|
|
|
|
|
def test_advance_awaiting_judges_errors_when_no_parseable(tmp_path):
    """Three non-YAML judge replies on an overdue round end in ``error``.

    NOTE(review): the round is also backdated past ``round_timeout_s``, so
    this test alone does not distinguish an unparseable-report failure from
    a timeout failure. Confirm which path is intended.
    """
    overdue_seconds = FakeConfig.round_timeout_s + 5
    stale_round, queue = _make_round(
        tmp_path,
        phase="awaiting_judges",
        status="running",
        judge_mention_comment_id="jm-c",
        phase_entered_at=_past(overdue_seconds),
    )
    client = FakeClient()
    comments = [
        {"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(overdue_seconds)},
    ]
    junk_replies = (
        ("agent-gpt", "not yaml at all"),
        ("agent-claude", "also not yaml"),
        ("agent-gemini", "garbage"),
    )
    for agent_id, junk in junk_replies:
        comments.append(_reply_comment(agent_id, junk))
    client.comments = comments

    _advance_awaiting_judges(stale_round, client, queue, FakeConfig(), _logger)

    assert stale_round.phase == "error"
    assert stale_round.status == "error"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Debate round cap
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_awaiting_debate_errors_out_at_cap(tmp_path):
    """A round at ``MAX_DEBATE_ROUNDS`` with still-divergent scores ends in ``error``."""
    round_kwargs = {
        "phase": "awaiting_debate",
        "status": "running",
        "judge_mention_comment_id": "jm-c",
        "phase_entered_at": _utcnow(),
        "debate_round": MAX_DEBATE_ROUNDS,
    }
    capped_round, queue = _make_round(tmp_path, **round_kwargs)
    client = FakeClient()
    comments = [{"id": "jm-c", "author_id": "coord", "content": "mention", "created_at": _past(5)}]
    disagreeing = (("agent-gpt", 2.0, 2), ("agent-claude", 4.0, 4), ("agent-gemini", 3.0, 3))
    for agent_id, final, correctness in disagreeing:
        comments.append(_reply_comment(agent_id, _judge_report(final, {"Correctness": correctness})))
    client.comments = comments

    _advance_awaiting_debate(capped_round, client, queue, FakeConfig(), _logger)

    assert capped_round.phase == "error"
    assert capped_round.status == "error"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Race fix: issue status moves BEFORE round marked done
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_apply_verdict_updates_issue_before_marking_round_done(tmp_path):
    """On ACCEPT the issue status is updated before the round is marked done.

    The client and the queue are wrapped in recording subclasses. Each one
    appends a marker to the shared ``call_order`` list and then delegates to
    its parent class, so the relative order of the two calls can be asserted.
    """
    call_order: list[str] = []

    class TrackingClient(FakeClient):
        """FakeClient that records every issue status update."""

        def update_issue_status(self, issue_id, status):
            call_order.append(f"issue:{status}")
            return super().update_issue_status(issue_id, status)

    class TrackingQueue(DebateQueue):
        """DebateQueue that records every round status update."""

        def update_status(self, round_id, status):
            call_order.append(f"round:{status}")
            # Forward the parent's result, as TrackingClient does, so the
            # spy does not change what the code under test receives.
            return super().update_status(round_id, status)

    r = Round(
        round_id="r1",
        issue_id="issue-1",
        identifier="WYL-X",
        title="t",
        enqueued_at=_utcnow(),
        status="running",
        phase="awaiting_judges",
        phase_entered_at=_utcnow(),
    )
    q = TrackingQueue.load(tmp_path / "queue.json")
    q.rounds.append(r)
    q.save()
    client = TrackingClient()

    _apply_verdict(r, client, q, "ACCEPT", "VERDICT: ACCEPT\nScore 4.0", _logger)

    assert "issue:done" in call_order
    assert "round:done" in call_order
    assert call_order.index("issue:done") < call_order.index("round:done")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _post_rejection_retrigger corner cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_reject_retrigger_skipped_for_member_assignee(tmp_path):
    """A member assignee gets a single mention-free manual follow-up note."""
    rejected_round, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue["assignee_type"] = "member"
    client.issue["assignee_id"] = "user-1"

    comment_id = _post_rejection_retrigger(
        rejected_round, client, client.issue, "VERDICT: REJECT", _logger
    )

    assert comment_id is None
    assert len(client.posted_comments) == 1
    note = client.posted_comments[0]
    assert "mention://" not in note
    assert "Manual follow-up" in note
|
|
|
|
|
|
def test_reject_retrigger_skipped_when_no_assignee(tmp_path):
    """With no assignee at all, no retrigger comment id is returned."""
    rejected_round, _queue = _make_round(tmp_path)
    client = FakeClient()
    for field in ("assignee_type", "assignee_id"):
        client.issue[field] = None

    comment_id = _post_rejection_retrigger(
        rejected_round, client, client.issue, "VERDICT: REJECT", _logger
    )

    assert comment_id is None
|
|
|
|
|
|
def test_reject_retrigger_includes_verbatim_description(tmp_path):
    """For an agent assignee the retrigger quotes every non-blank description line."""
    rejected_round, _queue = _make_round(tmp_path)
    client = FakeClient()
    client.issue["assignee_type"] = "agent"
    client.issue["assignee_id"] = "agent-worker"
    client.agents.append({"name": "Worker", "id": "agent-worker"})
    desc = "Line A.\n\nLine B with unique-marker-123.\n\nLine C."
    client.issue["description"] = desc

    comment_id = _post_rejection_retrigger(
        rejected_round, client, client.issue, "VERDICT: REJECT", _logger
    )

    assert comment_id is not None
    body = client.posted_comments[-1]
    non_blank_lines = [line for line in desc.splitlines() if line.strip()]
    for line in non_blank_lines:
        assert line in body
    assert REWORK_INSTRUCTIONS in body
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _advance_round dispatch
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_advance_round_dispatches_convened_to_start(tmp_path):
    """A pending round in phase ``convened`` ends up in ``awaiting_rubric``."""
    pending_round, queue = _make_round(tmp_path, phase="convened", status="pending")
    _advance_round(pending_round, FakeClient(), queue, FakeConfig(), _logger)
    assert pending_round.phase == "awaiting_rubric"
|
|
|
|
|
|
def test_advance_round_corrects_terminal_phase_with_wrong_status(tmp_path):
    """A round in phase ``accepted`` that still says ``running`` becomes ``done``."""
    inconsistent_round, queue = _make_round(tmp_path, phase="accepted", status="running")
    _advance_round(inconsistent_round, FakeClient(), queue, FakeConfig(), _logger)
    assert inconsistent_round.status == "done"
|