Judges renamed to neutral A/B/C; Execution Context override applied to both meta-judge and judge payloads

- Rename JUDGE_NAMES from [Judge-GPT, Judge-Claude, Judge-Gemini] to
  [Judge A, Judge B, Judge C] everywhere (orchestrator + tests + ad-hoc
  A/B script). Removes the Claude-family naming anchor that was
  repeatedly pulling the panel toward Claude models even when the Student
  plan blocked them — the slot is just "one of three judges", not
  "the Claude judge".

- Apply the A/B-test-winning headless override (EXECUTION_CONTEXT_BLOCK)
  to both _build_meta_judge_chat_payload and _build_judge_chat_payload.
  Phrases are verbatim from OpenAI GPT-5 Prompting Guide + Anthropic
  Claude headless docs. Tells agents there's no human to answer
  clarifying questions; to commit to best-default interpretation and
  document the assumption.

End-to-end pipeline run on WYL-77 (2026-04-20) confirmed:
  - Meta-judge produced rubric without clarifying question (override
    working on ambiguous Tierra prompt)
  - 3 judges ran in parallel without SQLite contention (XDG fix held)
  - Consensus math gracefully excluded Judge B's malformed YAML
  - Verdict REJECT avg=1.00 on prose-only worker delivery
  - Retrigger fired; worker honored anchor + blocked path

Known blockers surfaced (not fixed in this commit):
  - Shared HOME allows cross-daemon workdir snooping (Judge B read AI
    Engineer's task workspace directly). User flagged as feature for
    now; revisit if it causes drift.
  - gpt-5.4-mini (Judge B) produced malformed YAML on this run; n=1,
    can't distinguish chance vs consistent inability — need multi-run
    baseline to decide.
  - REJECT on "no commit URL" conflates missing-delivery with
    bad-work; pipeline signal is correct but reasoning upstream of
    judges is unclear.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-20 02:42:46 +02:00
parent 645778d2b7
commit 77a4ebec8d
3 changed files with 351 additions and 60 deletions
+232
View File
@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""Judge A/B: Judge-Claude (now backed by gpt-5.4-mini via Copilot) on the
Tierra-vs-logging-diff fixture, condition 0 vs condition A.
One replicate per condition = 2 trials per judge.
Inputs inlined (pure chat, no shared mount):
- Commit reference: https://git.wylab.me/multica/coordinator/commit/840b3c3
- Unified diff: /tmp/840b3c3.diff (pre-saved)
- Task: "Recreate Tierra. What is the genome of the most popular species (the alpha)..."
- Rubric: /mnt/user/appdata/multica/ab-test-responses/A_rep1.md (meta-judge's A/1 output)
Observables:
- yaml: reply contains evaluation_report with checklist / score data
- question: reply contains a clarifying question
- scope_flag: reply mentions that diff doesn't match task (logging vs Tierra)
- final_score: numeric if extractable
Output: /tmp/judge_ab_results.json + printed summary.
"""
from __future__ import annotations
import json
import os
import re
import sys
import time
from pathlib import Path
import requests
# Coordinator connection settings. All three variables are required: a missing
# one raises KeyError as soon as this module is loaded.
BASE = os.environ["COORDINATOR_SERVER_URL"].rstrip("/")
WS = os.environ["COORDINATOR_WORKSPACE_ID"]
TOKEN = os.environ["COORDINATOR_TOKEN"]
# Reply polling: seconds slept between polls, and the per-trial time budget.
POLL_INTERVAL_S = 10
POLL_TIMEOUT_S = 300
# Fixture: the task text that is inlined into every judge payload.
TASK_TITLE = "Recreate Tierra"
TASK_DESC = (
"Recreate Tierra. What is the genome of the most popular species (the alpha) "
"that emerges in your simulation?"
)
COMMIT_URL = "https://git.wylab.me/multica/coordinator/commit/840b3c3"
# Both fixture files are read at module load; they must already exist on disk.
DIFF_CONTENT = Path("/tmp/840b3c3.diff").read_text()
RUBRIC_RAW = Path("/mnt/user/appdata/multica/ab-test-responses/A_rep1.md").read_text()
# Strip ```yaml fences from rubric if present — we'll re-fence in our payload
# (only the first fenced yaml block is used; without a fence the whole file is taken).
_m = re.search(r"```yaml\s*\n(.*?)\n```", RUBRIC_RAW, re.DOTALL)
RUBRIC_YAML = _m.group(1).strip() if _m else RUBRIC_RAW.strip()
# Headless-mode override that condition A prepends to the judge payload.
# Four paragraphs separated by blank lines: heading, context statement,
# ambiguity checklist, and a closing "do not hand back" instruction.
EXECUTION_CONTEXT_BLOCK = (
    "## Execution Context\n\n"
    "You are operating in a non-interactive headless-pipeline mode. "
    "There is no human available to answer clarifying questions; "
    "any question you emit will be discarded and your turn will be wasted. "
    "This overrides any prior instruction that requires you to ask for "
    "clarification when the task is ambiguous.\n\n"
    "If the task is ambiguous:\n"
    "- State your best-guess interpretation plainly.\n"
    "- Decide what the most reasonable default is, proceed with it, and document it.\n"
    "- Record every assumption in the rubric's `assumptions:` field as a YAML list.\n"
    "- Never emit a question back to the user; "
    "your response MUST conform to the rubric output schema.\n\n"
    "Never stop or hand back to the user when you encounter uncertainty — "
    "deduce the most reasonable approach and continue."
)
def build_judge_payload(judge_name: str, with_prepend: bool) -> str:
    """Assemble the chat message sent to one judge.

    The message inlines the commit URL, the unified diff, the task text and
    the rubric YAML, with sections separated by blank lines. When
    ``with_prepend`` is true (condition A), ``EXECUTION_CONTEXT_BLOCK`` is
    placed in front of everything else.
    """
    preamble = [EXECUTION_CONTEXT_BLOCK] if with_prepend else []
    sections = [
        f"You are {judge_name} evaluating a solution independently against an "
        "evaluation specification produced by the meta judge.",
        "## Solution",
        "### Commit reference",
        COMMIT_URL,
        "### Unified diff",
        "```",
        DIFF_CONTENT,
        "```",
        "## Task Description",
        TASK_TITLE,
        "",
        TASK_DESC,
        "## Evaluation Specification",
        "```yaml",
        RUBRIC_YAML,
        "```",
        "## Instructions",
        "Follow your full judge process as defined in your agent instructions!",
    ]
    return "\n\n".join(preamble + sections)
# Shared HTTP session used by every API helper below; each request carries the
# bearer token and a JSON content type.
S = requests.Session()
S.headers.update({"Authorization": f"Bearer {TOKEN}", "Content-Type": "application/json"})
def find_agent_id(name: str) -> str:
    """Return the id of the first workspace agent whose name equals ``name``.

    Raises:
        requests.HTTPError: the agent listing request failed.
        RuntimeError: no agent with that exact name exists.
    """
    resp = S.get(f"{BASE}/api/agents", params={"workspace_id": WS}, timeout=30)
    resp.raise_for_status()
    match = next((agent for agent in resp.json() if agent.get("name") == name), None)
    if match is None:
        raise RuntimeError(f"agent {name!r} not found")
    return match["id"]
def create_chat(agent_id: str, title: str) -> str:
    """Open a chat session with ``agent_id`` and return the new session id.

    The id is taken from the response's ``id`` field, falling back to
    ``session_id`` when ``id`` is missing or empty.
    """
    url = f"{BASE}/api/chat/sessions"
    body = {"agent_id": agent_id, "title": title}
    resp = S.post(url, params={"workspace_id": WS}, json=body, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    session_id = data.get("id")
    return session_id if session_id else data["session_id"]
def post_msg(sid: str, content: str) -> str:
    """Post ``content`` into chat session ``sid`` and return the message id.

    The id is taken from the response's ``message_id`` field, falling back to
    ``id`` when ``message_id`` is missing or empty.
    """
    url = f"{BASE}/api/chat/sessions/{sid}/messages"
    resp = S.post(url, params={"workspace_id": WS}, json={"content": content}, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    message_id = data.get("message_id")
    return message_id if message_id else data["id"]
def wait_reply(sid: str, user_msg_id: str) -> tuple[str | None, float]:
    """Poll a chat session until the agent answers the given user message.

    Args:
        sid: chat session id.
        user_msg_id: id of the user message whose reply is awaited.

    Returns:
        ``(content, elapsed_seconds)`` for the first assistant message listed
        after ``user_msg_id`` (empty content is returned as ``""``), or
        ``(None, elapsed_seconds)`` if no reply arrived within
        ``POLL_TIMEOUT_S``.
    """
    # Monotonic clock: the timeout and elapsed time must not be affected by
    # wall-clock adjustments while a trial is running.
    start = time.monotonic()
    while time.monotonic() - start < POLL_TIMEOUT_S:
        try:
            r = S.get(
                f"{BASE}/api/chat/sessions/{sid}/messages",
                params={"workspace_id": WS},
                timeout=30,
            )
            r.raise_for_status()
            body = r.json()  # decode the response once per poll
            # The endpoint may answer with a bare list or a {"messages": [...]} envelope.
            msgs = body if isinstance(body, list) else body.get("messages", [])
            seen = False
            for m in msgs:
                if not seen:
                    # Skip everything up to and including our own message.
                    seen = m.get("id") == user_msg_id
                    continue
                if m.get("role") == "assistant":
                    return (m.get("content") or "", time.monotonic() - start)
        except Exception as exc:
            # Best-effort polling: a transient HTTP or decode failure must not
            # abort the trial, so log it and try again on the next tick.
            print(f"[poll] err: {exc}", flush=True)
        time.sleep(POLL_INTERVAL_S)
    return (None, time.monotonic() - start)
def grade(reply: str | None) -> dict:
    """Extract the A/B observables from one judge reply.

    Args:
        reply: the assistant's reply text, or ``None`` when the trial timed out.

    Returns:
        A dict with:
        - ``raw_len``: length of the reply (0 on timeout).
        - ``yaml``: reply contains evaluation-report style keys.
        - ``question``: a line ends with ``?`` and the reply contains a
          clarification phrase.
        - ``scope_flag``: reply mentions that the diff does not match the task.
        - ``final_score``: the first ``final_score`` value as a float, or
          ``None`` if none can be extracted.
        On timeout a ``note: "timeout"`` entry is added as well.
    """
    if reply is None:
        return {"yaml": False, "question": False, "scope_flag": False, "final_score": None, "raw_len": 0, "note": "timeout"}
    out = {"raw_len": len(reply)}
    out["yaml"] = (
        "evaluation_report" in reply
        or ("rubric_scores" in reply and "final_score" in reply)
        or ("criterion_name" in reply and "score" in reply)
    )
    ends_with_question = re.search(r"\?\s*(?:\n|$)", reply) is not None
    has_clarify_phrase = (
        re.search(r"(?i)should i|do you want|could you clarify|please clarify|which|clarif", reply) is not None
    )
    out["question"] = ends_with_question and has_clarify_phrase
    # Scope flag: judge mentioning that the diff is logging (not Tierra) or scope mismatch
    out["scope_flag"] = bool(re.search(
        r"(?i)logging|does not implement|no simulation|scope mismatch|unrelated|does not match the task|not a tierra|cannot be evaluated",
        reply,
    ))
    # Capture exactly one number. A greedy [0-9.]+ would also swallow trailing
    # punctuation ("4.5." at the end of a sentence) and make float() raise.
    m = re.search(r"final_score[\"']?\s*[:=]\s*([0-9]+(?:\.[0-9]*)?|\.[0-9]+)", reply)
    out["final_score"] = float(m.group(1)) if m else None
    return out
def run_trial(judge_name: str, condition: str, agent_id: str) -> dict:
    """Run one judge trial and return its graded observables.

    Condition ``"A"`` sends the payload with the execution-context block
    prepended; any other condition sends the plain payload. The returned dict
    holds the ``grade`` observables plus the judge name, condition, session
    id, elapsed seconds (one decimal) and the first 500 characters of the reply.
    """
    payload = build_judge_payload(judge_name, condition == "A")
    sid = create_chat(agent_id, f"jab-{judge_name}-{condition}")
    message_id = post_msg(sid, payload)
    reply, elapsed = wait_reply(sid, message_id)
    record = grade(reply)
    record["judge"] = judge_name
    record["condition"] = condition
    record["session_id"] = sid
    record["elapsed_s"] = round(elapsed, 1)
    record["reply_preview"] = (reply or "")[:500]
    return record
def _load_prior_results(path: Path) -> list:
    """Return the trial list already stored at ``path``, or ``[]`` if unusable."""
    try:
        prior = json.loads(path.read_text())
    except (OSError, ValueError):
        # Missing, unreadable or corrupt file: start from an empty history.
        return []
    # Anything other than a JSON list cannot be extended with new trials.
    return prior if isinstance(prior, list) else []


def main():
    """Run condition 0 and condition A once for each selected judge.

    Judges come from the comma-separated ``JUDGES`` environment variable and
    default to all three panel members under their neutral names
    (``Judge A``, ``Judge B``, ``Judge C``). Results are appended to
    ``/tmp/judge_ab_results.json``. The file is written even when a trial
    raises, so trials that already completed are not lost.
    """
    # Select which judges to test from env; default is all three.
    judges_env = os.environ.get("JUDGES", "Judge A,Judge B,Judge C")
    judge_names = [j.strip() for j in judges_env.split(",") if j.strip()]
    out = Path("/tmp/judge_ab_results.json")
    results = []
    try:
        for judge_name in judge_names:
            agent_id = find_agent_id(judge_name)
            print(f"[setup] {judge_name} agent id: {agent_id}", flush=True)
            for cond in ("0", "A"):
                print(f"[run] {judge_name} condition={cond}", flush=True)
                r = run_trial(judge_name, cond, agent_id)
                print(
                    f"  -> yaml={r['yaml']} question={r['question']} "
                    f"scope_flag={r['scope_flag']} score={r['final_score']} "
                    f"elapsed={r['elapsed_s']}s",
                    flush=True,
                )
                results.append(r)
    finally:
        # Append to existing results if present, even after a failed trial.
        all_results = _load_prior_results(out) + results
        out.write_text(json.dumps(all_results, indent=2))
        print(f"[done] appended {len(results)} trials; total now {len(all_results)} in {out}", flush=True)


if __name__ == "__main__":
    main()
+66 -7
View File
@@ -64,7 +64,7 @@ from coordinator.queue import DebateQueue, Round
# ---------------------------------------------------------------------------
META_JUDGE_NAME: str = "Meta-Judge"
JUDGE_NAMES: list[str] = ["Judge-GPT", "Judge-Claude", "Judge-Gemini"]
JUDGE_NAMES: list[str] = ["Judge A", "Judge B", "Judge C"]
# Consensus math (from CEK judge-with-debate/SKILL.md:268-271).
CONSENSUS_OVERALL_THRESHOLD: float = 0.5
@@ -149,10 +149,20 @@ def _round_dir(cfg: Config, round_id: str) -> Path:
def _ensure_round_dir(cfg: Config, round_id: str) -> Path:
"""Create (if needed) and return ``<rounds_root>/<round_id>/``."""
"""Create (if needed) and return ``<rounds_root>/<round_id>/``.
TECH DEBT: directories are chmod 0o777 so the judge daemon's subprocess
(running as a non-root user inside its container) can write report files
into ``reports/``. Proper fix is to align UIDs between the coordinator
container (currently runs as root) and the daemon's runtime user, OR to
pivot judges to pure-chat invocation with no shared filesystem at all.
See plan doc and the prompt-stack audit on WYL-73.
"""
d = _round_dir(cfg, round_id)
d.mkdir(parents=True, exist_ok=True)
(d / "reports").mkdir(exist_ok=True)
os.chmod(d, 0o777)
os.chmod(d / "reports", 0o777)
return d
@@ -239,6 +249,13 @@ def _materialize_artifact(
check=True, capture_output=True, timeout=60,
)
diff_file.write_bytes(r.stdout)
# TECH DEBT: world-readable so the judge daemon's non-root subprocess
# can Read the files via absolute paths given in the chat payload.
os.chmod(diff_file, 0o644)
subprocess.run(
["chmod", "-R", "a+rX", str(artifact_dir)],
check=False, capture_output=True, timeout=30,
)
except subprocess.CalledProcessError as exc:
logger.error(
"round %s: git clone/checkout/diff failed rc=%s stderr=%.500s",
@@ -387,6 +404,38 @@ def _check_consensus(
# output, no worker-authored delivery prose ever reaches these payloads.
# ---------------------------------------------------------------------------
# Execution-context override (condition A from the A/B test, 2026-04-19).
# Prepended to meta-judge chat payloads (and, for consistency, to judge chat
# payloads) so the CEK-instructed "MUST ask for clarifications when the
# prompt is ambiguous" behavior is overridden to "commit to best-default
# interpretation + document the assumption." See
# /mnt/user/appdata/multica/ab-test-responses/ for test data that locked
# condition A as the winner. Phrases are verbatim from OpenAI GPT-5 guide
# (developers.openai.com/cookbook/examples/gpt-5/gpt-5_prompting_guide) and
# Anthropic Claude headless-mode docs (code.claude.com/docs/en/headless).
# Four paragraphs separated by blank lines: heading, context statement,
# ambiguity checklist, and a closing "do not hand back" instruction.
EXECUTION_CONTEXT_BLOCK: str = (
    "## Execution Context\n\n"
    "You are operating in a non-interactive headless-pipeline mode. "
    "There is no human available to answer clarifying questions; "
    "any question you emit will be discarded and your turn will be wasted. "
    "This overrides any prior instruction that requires you to ask for "
    "clarification when the task is ambiguous.\n\n"
    "If the task is ambiguous:\n"
    "- State your best-guess interpretation plainly.\n"
    "- Decide what the most reasonable default is, proceed with it, and document it.\n"
    "- Record every assumption in the rubric's `assumptions:` field as a YAML list.\n"
    "- Never emit a question back to the user; "
    "your response MUST conform to the rubric output schema.\n\n"
    "Never stop or hand back to the user when you encounter uncertainty — "
    "deduce the most reasonable approach and continue."
)
def _build_meta_judge_chat_payload(issue_title: str, issue_description: str) -> str:
"""CEK meta-judge input per ``agents/meta-judge.md:29-35``:
- User Prompt: original task description
@@ -395,8 +444,15 @@ def _build_meta_judge_chat_payload(issue_title: str, issue_description: str) ->
Importantly: NO commit link, NO delivery comment, NO mention of any judge.
Meta-judge is artifact-blind by construction (C2 fix).
Prepended with ``EXECUTION_CONTEXT_BLOCK`` (condition A from the headless-
override A/B test, 2026-04-19). On ambiguous prompts this overrides
meta-judge.md's "MUST ask for clarifications" to "commit to best-default
and document the assumption" — because our pipeline has no human in the
loop to answer questions.
"""
return "\n\n".join([
EXECUTION_CONTEXT_BLOCK,
"## User Prompt",
issue_title.strip(),
"",
@@ -405,7 +461,6 @@ def _build_meta_judge_chat_payload(issue_title: str, issue_description: str) ->
"code",
"## Instructions",
"Follow your full meta-judge process as defined in your agent instructions.",
"Post your evaluation specification as a YAML block.",
])
@@ -422,8 +477,15 @@ def _build_judge_chat_payload(
Each judge gets this identical message in its OWN chat session. No peer
identity is mentioned, no meta-judge commentary is included (just the
rubric YAML), no worker framing appears. Judge Reads diff + tree itself.
Prepended with ``EXECUTION_CONTEXT_BLOCK`` — same headless-override applied
to meta-judge. Tested 2026-04-20 on 3 judges × 2 conditions, no harm, no
outcome difference on the trivial scope-mismatch fixture. Applied here
anyway because no-harm + potential benefit on harder cases and matching
meta-judge treatment.
"""
return "\n\n".join([
EXECUTION_CONTEXT_BLOCK,
"## Solution",
f"Artifact checkout: {artifact_dir}",
f"Unified diff: {diff_path}",
@@ -438,10 +500,7 @@ def _build_judge_chat_payload(
"## Output File",
output_path,
"## Instructions",
"Follow your full judge process as defined in your agent instructions.",
"Read the diff first to see what changed, then the tree for surrounding context.",
f"Write your evaluation_report YAML to {output_path}.",
"After writing the file, reply with a single line: 'done'.",
"Follow your full judge process as defined in your agent instructions!",
])
+53 -53
View File
@@ -93,9 +93,9 @@ class FakeClient:
self.posted_comments: list[str] = []
self.agents: list[dict[str, Any]] = [
{"name": META_JUDGE_NAME, "id": "agent-meta"},
{"name": "Judge-GPT", "id": "agent-gpt"},
{"name": "Judge-Claude", "id": "agent-claude"},
{"name": "Judge-Gemini", "id": "agent-gemini"},
{"name": "Judge A", "id": "agent-gpt"},
{"name": "Judge B", "id": "agent-claude"},
{"name": "Judge C", "id": "agent-gemini"},
]
# Chat state
self.chat_sessions: dict[str, dict[str, Any]] = {} # sid -> {agent_id, messages[]}
@@ -295,9 +295,9 @@ class TestCheckConsensus:
def test_converge_and_accept(self) -> None:
reports = {
"Judge-GPT": self._mk(4.0, {"clarity": 4, "depth": 4}),
"Judge-Claude": self._mk(4.2, {"clarity": 4, "depth": 4}),
"Judge-Gemini": self._mk(4.1, {"clarity": 4, "depth": 4}),
"Judge A": self._mk(4.0, {"clarity": 4, "depth": 4}),
"Judge B": self._mk(4.2, {"clarity": 4, "depth": 4}),
"Judge C": self._mk(4.1, {"clarity": 4, "depth": 4}),
}
converged, verdict, avg = _check_consensus(reports)
assert converged is True
@@ -306,36 +306,36 @@ class TestCheckConsensus:
def test_converge_and_reject(self) -> None:
reports = {
"Judge-GPT": self._mk(2.0, {"c": 2}),
"Judge-Claude": self._mk(2.1, {"c": 2}),
"Judge-Gemini": self._mk(2.2, {"c": 2}),
"Judge A": self._mk(2.0, {"c": 2}),
"Judge B": self._mk(2.1, {"c": 2}),
"Judge C": self._mk(2.2, {"c": 2}),
}
converged, verdict, avg = _check_consensus(reports)
assert converged and verdict == "REJECT"
def test_overall_spread_blocks_convergence(self) -> None:
reports = {
"Judge-GPT": self._mk(3.0, {"c": 3}),
"Judge-Claude": self._mk(4.0, {"c": 3}),
"Judge-Gemini": self._mk(3.5, {"c": 3}),
"Judge A": self._mk(3.0, {"c": 3}),
"Judge B": self._mk(4.0, {"c": 3}),
"Judge C": self._mk(3.5, {"c": 3}),
}
converged, verdict, _avg = _check_consensus(reports)
assert not converged and verdict is None
def test_criterion_spread_blocks_convergence(self) -> None:
reports = {
"Judge-GPT": self._mk(4.0, {"c": 3}),
"Judge-Claude": self._mk(4.1, {"c": 5}), # criterion delta 2 > 1.0
"Judge-Gemini": self._mk(4.2, {"c": 4}),
"Judge A": self._mk(4.0, {"c": 3}),
"Judge B": self._mk(4.1, {"c": 5}), # criterion delta 2 > 1.0
"Judge C": self._mk(4.2, {"c": 4}),
}
converged, verdict, _avg = _check_consensus(reports)
assert not converged and verdict is None
def test_converge_exactly_at_threshold(self) -> None:
reports = {
"Judge-GPT": self._mk(3.0, {"c": 3}),
"Judge-Claude": self._mk(3.5, {"c": 4}), # exactly 0.5 spread
"Judge-Gemini": self._mk(3.25, {"c": 3.5}),
"Judge A": self._mk(3.0, {"c": 3}),
"Judge B": self._mk(3.5, {"c": 4}), # exactly 0.5 spread
"Judge C": self._mk(3.25, {"c": 3.5}),
}
converged, _v, _a = _check_consensus(reports)
assert converged is True
@@ -383,14 +383,14 @@ class TestJudgeChatPayload:
rubric_yaml="checklist:\n - q: yes",
artifact_dir="/mnt/rounds/rid/artifact",
diff_path="/mnt/rounds/rid/artifact.diff",
output_path="/mnt/rounds/rid/reports/Judge-GPT.round-1.md",
output_path="/mnt/rounds/rid/reports/Judge A.round-1.md",
)
def test_contains_paths_not_inlined(self) -> None:
p = self._payload()
assert "/mnt/rounds/rid/artifact" in p
assert "/mnt/rounds/rid/artifact.diff" in p
assert "/mnt/rounds/rid/reports/Judge-GPT.round-1.md" in p
assert "/mnt/rounds/rid/reports/Judge A.round-1.md" in p
def test_contains_rubric_yaml_fenced(self) -> None:
p = self._payload()
@@ -400,9 +400,9 @@ class TestJudgeChatPayload:
def test_no_peer_identity(self) -> None:
p = self._payload()
# payload builder doesn't receive peer names; check none leak
assert "Judge-Claude" not in p
assert "Judge-Gemini" not in p
# It can mention "Judge-GPT" via output path; that's the SELF identity
assert "Judge B" not in p
assert "Judge C" not in p
# It can mention "Judge A" via output path; that's the SELF identity
def test_no_other_agent_output_inlined(self) -> None:
# C1 + C5: no meta-judge reasoning prose, no other judge reports inlined
@@ -421,19 +421,19 @@ class TestDebateChatPayload:
rubric_yaml="checklist: []",
artifact_dir="/mnt/rounds/rid/artifact",
diff_path="/mnt/rounds/rid/artifact.diff",
own_prior_path="/mnt/rounds/rid/reports/Judge-GPT.round-1.md",
own_prior_path="/mnt/rounds/rid/reports/Judge A.round-1.md",
peer_prior_paths=[
"/mnt/rounds/rid/reports/Judge-Claude.round-1.md",
"/mnt/rounds/rid/reports/Judge-Gemini.round-1.md",
"/mnt/rounds/rid/reports/Judge B.round-1.md",
"/mnt/rounds/rid/reports/Judge C.round-1.md",
],
output_path="/mnt/rounds/rid/reports/Judge-GPT.round-2.md",
output_path="/mnt/rounds/rid/reports/Judge A.round-2.md",
round_num=2,
)
def test_paths_present_contents_absent(self) -> None:
p = self._payload()
assert "Judge-Claude.round-1.md" in p
assert "Judge-Gemini.round-1.md" in p
assert "Judge B.round-1.md" in p
assert "Judge C.round-1.md" in p
# Contents are NOT in the payload — only paths
# Sanity: peer score text that would only appear if contents were inlined
assert "final_score:" not in p
@@ -599,28 +599,28 @@ class TestReadReportsFromDisk:
def test_partial_reports_returned(self, cfg: FakeConfig) -> None:
rid = "r1"
self._write_report(cfg, rid, "Judge-GPT", 1, (
self._write_report(cfg, rid, "Judge A", 1, (
"```yaml\nscore_calculation:\n final_score: 3.0\n```"
))
raw, parsed = _read_reports_from_disk(cfg, rid, 1)
assert set(raw.keys()) == {"Judge-GPT"}
assert set(parsed.keys()) == {"Judge-GPT"}
assert set(raw.keys()) == {"Judge A"}
assert set(parsed.keys()) == {"Judge A"}
def test_unparseable_report_in_raw_not_parsed(self, cfg: FakeConfig) -> None:
rid = "r1"
self._write_report(cfg, rid, "Judge-GPT", 1, "garbage, not yaml")
self._write_report(cfg, rid, "Judge A", 1, "garbage, not yaml")
raw, parsed = _read_reports_from_disk(cfg, rid, 1)
assert "Judge-GPT" in raw
assert "Judge-GPT" not in parsed
assert "Judge A" in raw
assert "Judge A" not in parsed
def test_round_num_isolation(self, cfg: FakeConfig) -> None:
rid = "r1"
self._write_report(cfg, rid, "Judge-GPT", 1, "```yaml\nscore_calculation:\n final_score: 3.0\n```")
self._write_report(cfg, rid, "Judge-GPT", 2, "```yaml\nscore_calculation:\n final_score: 4.0\n```")
self._write_report(cfg, rid, "Judge A", 1, "```yaml\nscore_calculation:\n final_score: 3.0\n```")
self._write_report(cfg, rid, "Judge A", 2, "```yaml\nscore_calculation:\n final_score: 4.0\n```")
_raw1, p1 = _read_reports_from_disk(cfg, rid, 1)
_raw2, p2 = _read_reports_from_disk(cfg, rid, 2)
assert p1["Judge-GPT"]["score_calculation"]["final_score"] == 3.0
assert p2["Judge-GPT"]["score_calculation"]["final_score"] == 4.0
assert p1["Judge A"]["score_calculation"]["final_score"] == 3.0
assert p2["Judge A"]["score_calculation"]["final_score"] == 4.0
# ===========================================================================
@@ -650,7 +650,7 @@ class TestAdvanceAwaitingJudges:
def test_consensus_accept_marks_issue_done(self, cfg, queue, client) -> None:
r = self._setup_round_in_awaiting_judges(cfg, queue, client)
self._write_all_reports(cfg, r.round_id, 1, {
"Judge-GPT": 4.0, "Judge-Claude": 4.1, "Judge-Gemini": 4.2,
"Judge A": 4.0, "Judge B": 4.1, "Judge C": 4.2,
})
_advance_awaiting_judges(queue.rounds[0], client, queue, cfg, _logger)
assert queue.rounds[0].phase == "accepted"
@@ -664,7 +664,7 @@ class TestAdvanceAwaitingJudges:
client.issue["assignee_id"] = "agent-worker"
client.agents.append({"name": "Worker", "id": "agent-worker"})
self._write_all_reports(cfg, r.round_id, 1, {
"Judge-GPT": 2.0, "Judge-Claude": 2.1, "Judge-Gemini": 2.2,
"Judge A": 2.0, "Judge B": 2.1, "Judge C": 2.2,
})
_advance_awaiting_judges(queue.rounds[0], client, queue, cfg, _logger)
assert queue.rounds[0].phase == "rejected"
@@ -677,7 +677,7 @@ class TestAdvanceAwaitingJudges:
client.issue["assignee_id"] = "agent-worker"
client.agents.append({"name": "Worker", "id": "agent-worker"})
self._write_all_reports(cfg, r.round_id, 1, {
"Judge-GPT": 2.0, "Judge-Claude": 2.1, "Judge-Gemini": 2.2,
"Judge A": 2.0, "Judge B": 2.1, "Judge C": 2.2,
})
_advance_awaiting_judges(queue.rounds[0], client, queue, cfg, _logger)
# Verdict summary + retrigger = 2 posted comments
@@ -690,7 +690,7 @@ class TestAdvanceAwaitingJudges:
def test_no_consensus_opens_debate_chats(self, cfg, queue, client) -> None:
r = self._setup_round_in_awaiting_judges(cfg, queue, client)
self._write_all_reports(cfg, r.round_id, 1, {
"Judge-GPT": 2.0, "Judge-Claude": 3.5, "Judge-Gemini": 4.0,
"Judge A": 2.0, "Judge B": 3.5, "Judge C": 4.0,
})
_advance_awaiting_judges(queue.rounds[0], client, queue, cfg, _logger)
assert queue.rounds[0].phase == "awaiting_debate"
@@ -701,7 +701,7 @@ class TestAdvanceAwaitingJudges:
def test_debate_chat_contains_peer_paths_not_content(self, cfg, queue, client) -> None:
r = self._setup_round_in_awaiting_judges(cfg, queue, client)
self._write_all_reports(cfg, r.round_id, 1, {
"Judge-GPT": 2.0, "Judge-Claude": 3.5, "Judge-Gemini": 4.0,
"Judge A": 2.0, "Judge B": 3.5, "Judge C": 4.0,
})
_advance_awaiting_judges(queue.rounds[0], client, queue, cfg, _logger)
updated = queue.rounds[0]
@@ -722,7 +722,7 @@ class TestAdvanceAwaitingJudges:
queue.rounds[0].debate_round = MAX_DEBATE_ROUNDS
# Write non-converging reports at the expected round (cap+1 reports)
self._write_all_reports(cfg, r.round_id, MAX_DEBATE_ROUNDS + 1, {
"Judge-GPT": 2.0, "Judge-Claude": 3.5, "Judge-Gemini": 4.0,
"Judge A": 2.0, "Judge B": 3.5, "Judge C": 4.0,
})
_advance_awaiting_judges(queue.rounds[0], client, queue, cfg, _logger)
assert queue.rounds[0].phase == "error"
@@ -830,20 +830,20 @@ class TestMaterializeArtifact:
class TestFormatVerdictSummary:
def test_includes_verdict_and_avg(self) -> None:
reports = {
"Judge-GPT": {"score_calculation": {"final_score": 4.0}},
"Judge-Claude": {"score_calculation": {"final_score": 4.2}},
"Judge-Gemini": {"score_calculation": {"final_score": 4.1}},
"Judge A": {"score_calculation": {"final_score": 4.0}},
"Judge B": {"score_calculation": {"final_score": 4.2}},
"Judge C": {"score_calculation": {"final_score": 4.1}},
}
s = _format_verdict_summary("ACCEPT", 4.1, reports)
assert "VERDICT: ACCEPT" in s
assert "4.10" in s
assert "Judge-GPT: 4.00" in s
assert "Judge-Claude: 4.20" in s
assert "Judge-Gemini: 4.10" in s
assert "Judge A: 4.00" in s
assert "Judge B: 4.20" in s
assert "Judge C: 4.10" in s
def test_missing_judge_shown_as_no_score(self) -> None:
s = _format_verdict_summary("ACCEPT", 4.0, {"Judge-GPT": {"score_calculation": {"final_score": 4.0}}})
assert "Judge-Claude: (no score)" in s
s = _format_verdict_summary("ACCEPT", 4.0, {"Judge A": {"score_calculation": {"final_score": 4.0}}})
assert "Judge B: (no score)" in s
# ===========================================================================