diff --git a/src/coordinator/orchestrator.py b/src/coordinator/orchestrator.py
index 6c185d6..736a2f4 100644
--- a/src/coordinator/orchestrator.py
+++ b/src/coordinator/orchestrator.py
@@ -419,7 +419,14 @@ def _build_debate_round_comment(
     round_num: int,
     prior_reports: dict[str, str],
 ) -> str:
-    """Post a debate-round mention comment showing all 3 judges each others' prior reports."""
+    """Build a debate-round mention comment showing all 3 judges each other's prior reports.
+
+    Instruction text is taken directly from CEK's judge-with-debate skill
+    (plugins/sadd/skills/judge-with-debate/SKILL.md, "Additional debate
+    instructions" and "CRITICAL" blocks). Rewriting this into softer
+    phrasing produced sycophantic convergence in the first live run; we
+    stick to CEK's wording (only its ``{R}`` placeholder is filled with ``round_num``).
+    """
     mentions = " ".join(
         f"[@{name}](mention://agent/{judge_agent_ids[name]})"
         if judge_agent_ids.get(name)
@@ -429,13 +436,30 @@ def _build_debate_round_comment(
     parts = [
         mentions,
         f"**Debate round {round_num}** — your prior reports disagree beyond the consensus threshold.",
-        "Each of you: read the others' reports below, re-examine the commit, then post a REVISED `evaluation_report` YAML. You may hold your position if you have new evidence; you may move if you find the other reasoning more grounded. Do not split the difference to compromise.",
+        "Follow your full judge process as defined in your agent instructions.",
+        "",
+        "**Additional debate instructions:**",
+        "1. Read your previous assessment (your own prior report below).",
+        "2. Read all other judges' reports (also below).",
+        "3. Identify disagreements (where your scores differ by >1 point).",
+        "4. For each major disagreement:",
+        " - State the disagreement clearly",
+        " - Defend your position with evidence from the solution and evaluation specification",
+        " - Challenge the other judge's position with counter-evidence",
+        " - Consider whether their evidence changes your view",
+        f"5. Post a new reply containing a `## Debate Round {round_num}` section that APPENDS to your prior report (include your prior report verbatim first, then the new section).",
+        "6. At the end of your reply, state whether you reached agreement and with which judge. Include your (possibly revised) overall score and per-criterion scores in the same YAML format.",
+        "",
+        "**CRITICAL:**",
+        "- Ground your arguments in the evaluation specification criteria.",
+        "- **Only revise if you find their evidence compelling.**",
+        "- **Defend your original scores if you still believe them.**",
+        "- Quote specific evidence from the solution.",
     ]
     for name in JUDGE_NAMES:
         content = prior_reports.get(name, "*(no prior report)*")
         parts.append(f"---\n\n### {name} — prior report\n\n{content}")
     parts.append("---")
-    parts.append("Post your revised report as a NEW reply. Do not edit prior reports.")
     return "\n\n".join(parts)
 
 
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 1dc9a37..87bfc67 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -396,6 +396,28 @@ def test_debate_round_comment_quotes_all_prior_reports():
         assert f"REPORT FROM {n}" in body
 
 
+def test_debate_round_comment_contains_cek_anti_sycophancy_language():
+    """Regression: do not soften CEK's critical debate instructions.
+
+    The first live run produced sycophantic convergence because the earlier,
+    softer phrasing dropped CEK's explicit 'only revise if compelling / defend
+    original if you still believe them' instructions. Any future edit that
+    removes these exact clauses should be caught here.
+    """
+    judge_ids = {n: f"id-{n}" for n in JUDGE_NAMES}
+    body = _build_debate_round_comment(judge_ids, 1, {n: "x" for n in JUDGE_NAMES})
+    # CEK's structural instructions
+    assert "Identify disagreements (where your scores differ by >1 point)" in body
+    assert "Defend your position with evidence" in body
+    assert "Challenge the other judge's position with counter-evidence" in body
+    # CEK's CRITICAL anti-sycophancy list
+    assert "Only revise if you find their evidence compelling." in body
+    assert "Defend your original scores if you still believe them." in body
+    # APPEND not REVISE; the {R} placeholder must be rendered as the round number
+    assert "`## Debate Round 1` section that APPENDS to your prior report" in body
+    assert "REVISED" not in body  # the old softer phrasing is gone
+
+
 def test_retrigger_comment_has_anchor_and_no_drift_instructions():
     body = _build_retrigger_comment("Worker", "agent-worker", "Original desc line.", "VERDICT: REJECT", "r1")
     assert "mention://agent/agent-worker" in body