Fix YAML parser: HTML-unescape content before parse

Multica's comment REST API returns content with HTML entities escaped
(`&quot;` for `"`, `&gt;` for `>`, etc.). Agent replies are plain UTF-8,
so we unescape before extracting and parsing.

Caught by the first live test against WYL-72: Meta-Judge produced a
perfectly valid YAML evaluation specification with CK-001..CK-010 checklist
items, but my parser reported it as malformed because `&quot;` is not a
valid YAML token. The model was fine; the plumbing was wrong.

- src/coordinator/orchestrator.py: _extract_yaml now calls html.unescape first
- tests/test_orchestrator.py: +2 tests covering entity decoding

Tests: 69 passed.
This commit is contained in:
2026-04-18 22:13:31 +02:00
parent f88255096e
commit d1039d01de
2 changed files with 31 additions and 2 deletions
+8 -2
View File
@@ -43,6 +43,7 @@ Race fix invariant
"""
from __future__ import annotations
import html
import logging
import re
from datetime import datetime, timezone
@@ -177,14 +178,19 @@ def _find_reply_by_agent(
def _extract_yaml(content: str) -> str:
    """Pull the YAML block out of a reply. Prefers fenced code blocks.

    Multica's comment REST API returns content with HTML entities escaped
    (``&quot;`` for ``"``, ``&gt;`` for ``>``, etc.). Agent replies are
    plain UTF-8 to begin with, so we unescape first, then extract.

    Args:
        content: Raw comment body, possibly HTML-entity-escaped.

    Returns:
        The YAML text (without fences), or the whole unescaped content if no
        fence is found. The caller is responsible for deciding whether the
        raw content is parseable.
    """
    # Decode entities before searching so both the fenced and the unfenced
    # path hand back text with real quotes / angle brackets.
    unescaped = html.unescape(content)
    m = _YAML_FENCE_RE.search(unescaped)
    if m:
        return m.group(1).strip()
    return unescaped.strip()
def _parse_rubric(content: str) -> dict[str, Any] | None:
+23
View File
@@ -192,6 +192,29 @@ def test_extract_yaml_from_unfenced_content():
assert y == content.strip()
def test_extract_yaml_unescapes_html_entities():
    """Entity-escaped quotes in unfenced content come back as literal quotes."""
    # Multica REST API returns comment content with `"` as `&quot;`, `>` as `&gt;`, etc.
    # NOTE(review): the nesting indentation inside this literal was
    # reconstructed; confirm against the repository copy.
    content = (
        "checklist:\n"
        "  - id: &quot;CK-001&quot;\n"
        "    question: &quot;does it work?&quot;"
    )
    y = _extract_yaml(content)
    assert '"CK-001"' in y
    assert "&quot;" not in y
def test_parse_rubric_accepts_html_encoded_input():
    """An entity-escaped rubric is parsed instead of being rejected."""
    # The input uses `&quot;` where the agent wrote `"`, mirroring what the
    # comment REST API hands back.
    # NOTE(review): the nesting indentation inside these literals was
    # reconstructed; confirm against the repository copy.
    encoded = (
        "rrd_cycle_applied: true\n"
        "evaluation_specification:\n"
        "  checklist:\n"
        "    - id: &quot;CK-001&quot;\n"
        "      question: &quot;does it work?&quot;\n"
        "      category: &quot;hard_rule&quot;\n"
        "      importance: &quot;essential&quot;\n"
    )
    spec = _parse_rubric(encoded)
    assert spec is not None
    assert "checklist" in spec
def test_parse_rubric_valid_flat():
spec = _parse_rubric(f"```yaml\n{_rubric_yaml_sample()}\n```")
assert spec is not None