Fix YAML parser: HTML-unescape content before parse
Multica's comment REST API returns content with HTML entities escaped (`&quot;` for `"`, `&gt;` for `>`, etc.). Agent replies are plain UTF-8, so we unescape before extracting and parsing. Caught by the first live test against WYL-72: Meta-Judge produced a perfectly valid YAML evaluation specification with CK-001..CK-010 checklist items, but my parser reported it as malformed because `&quot;` is not a valid YAML token. The model was fine; the plumbing was wrong. - src/coordinator/orchestrator.py: _extract_yaml now calls html.unescape first - tests/test_orchestrator.py: +2 tests covering entity decoding Tests: 69 passed.
This commit is contained in:
@@ -43,6 +43,7 @@ Race fix invariant
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
@@ -177,14 +178,19 @@ def _find_reply_by_agent(
|
||||
def _extract_yaml(content: str) -> str:
|
||||
"""Pull the YAML block out of a reply. Prefers fenced code blocks.
|
||||
|
||||
Multica's comment REST API returns content with HTML entities escaped
|
||||
(``&quot;`` for ``"``, ``&gt;`` for ``>``, etc.). Agent replies are
|
||||
plain UTF-8 to begin with, so we unescape first, then extract.
|
||||
|
||||
Returns the YAML text (without fences), or the original content if no fence
|
||||
is found. The caller is responsible for deciding whether the raw content
|
||||
is parseable.
|
||||
"""
|
||||
m = _YAML_FENCE_RE.search(content)
|
||||
unescaped = html.unescape(content)
|
||||
m = _YAML_FENCE_RE.search(unescaped)
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
return content.strip()
|
||||
return unescaped.strip()
|
||||
|
||||
|
||||
def _parse_rubric(content: str) -> dict[str, Any] | None:
|
||||
|
||||
@@ -192,6 +192,29 @@ def test_extract_yaml_from_unfenced_content():
|
||||
assert y == content.strip()
|
||||
|
||||
|
||||
def test_extract_yaml_unescapes_html_entities():
|
||||
# Multica REST API returns comment content with `"` as `&quot;`, `>` as `&gt;`, etc.
|
||||
content = "checklist:\n - id: &quot;CK-001&quot;\n question: &quot;does it work?&quot;"
|
||||
y = _extract_yaml(content)
|
||||
assert '"CK-001"' in y
|
||||
assert "&quot;" not in y
|
||||
|
||||
|
||||
def test_parse_rubric_accepts_html_encoded_input():
|
||||
encoded = (
|
||||
"rrd_cycle_applied: true\n"
|
||||
"evaluation_specification:\n"
|
||||
" checklist:\n"
|
||||
" - id: &quot;CK-001&quot;\n"
|
||||
" question: &quot;does it work?&quot;\n"
|
||||
" category: &quot;hard_rule&quot;\n"
|
||||
" importance: &quot;essential&quot;\n"
|
||||
)
|
||||
spec = _parse_rubric(encoded)
|
||||
assert spec is not None
|
||||
assert "checklist" in spec
|
||||
|
||||
|
||||
def test_parse_rubric_valid_flat():
|
||||
spec = _parse_rubric(f"```yaml\n{_rubric_yaml_sample()}\n```")
|
||||
assert spec is not None
|
||||
|
||||
Reference in New Issue
Block a user