feat: sign intermediate messages so model knows what user didn't see
Build Nanobot OAuth / build (pull_request) Successful in 6m14s
Build Nanobot OAuth / cleanup (pull_request) Has been skipped

Intermediate assistant messages (with tool_calls) and tool result messages
are never sent to the user but remain in the model's context. This causes
the model to refer to content the user never saw.

Add _hidden_sig field at message creation time (context.py), then apply
[HIDDEN:sig] prefix at read time (session get_history) so the model sees
which messages were hidden. Storing the signature separately from content
preserves Anthropic prompt caching — the same prefixed string is produced
every turn.

Changes:
- visibility.py: add compute_signature(), refactor sign_content() and
  verify_signature() to use it, fix Tuple -> tuple (PEP 585)
- context.py: add_assistant_message() and add_tool_result() store _hidden_sig
- session/manager.py: get_history() applies [HIDDEN:sig] prefix at read time
- tests/test_message_visibility.py: 14 tests covering compute_signature,
  _hidden_sig creation, get_history prefix, JSONL round-trip, idempotency

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 15:23:46 +01:00
parent d90c3b4a24
commit 5569c99b8e
3 changed files with 156 additions and 14 deletions
-1
View File
@@ -260,7 +260,6 @@ visibility markers will be rejected."""
if tool_calls:
msg["tool_calls"] = tool_calls
# Mark as hidden: user never sees intermediate assistant messages
msg["_hidden_sig"] = compute_signature(content or "")
# Thinking models reject history without this
+3 -13
View File
@@ -4,7 +4,6 @@
import hmac
import hashlib
import re
from typing import Tuple
SECRET_KEY = "nanobot_visibility_secret_key_v1"
@@ -28,15 +27,11 @@ def sign_content(content: str) -> str:
Returns:
Content with signed visibility marker: "[HIDDEN:{sig}] {content}"
"""
sig = hmac.new(
SECRET_KEY.encode(),
content.encode(),
hashlib.sha256
).hexdigest()[:8]
sig = compute_signature(content)
return f"[HIDDEN:{sig}] {content}"
def verify_signature(marked_content: str) -> Tuple[bool, str]:
def verify_signature(marked_content: str) -> tuple[bool, str]:
"""
Verify HMAC signature and extract clean content.
@@ -53,12 +48,7 @@ def verify_signature(marked_content: str) -> Tuple[bool, str]:
return False, marked_content
claimed_sig, content = match.groups()
expected_sig = hmac.new(
SECRET_KEY.encode(),
content.encode(),
hashlib.sha256
).hexdigest()[:8]
expected_sig = compute_signature(content)
is_valid = hmac.compare_digest(claimed_sig, expected_sig)
return is_valid, content
+153
View File
@@ -0,0 +1,153 @@
"""Tests for message visibility signing (hidden intermediate messages)."""
import json
from pathlib import Path
from nanobot.agent.context import ContextBuilder
from nanobot.agent.visibility import compute_signature, sign_content
from nanobot.session.manager import Session
class TestComputeSignature:
    """Checks on compute_signature() and how sign_content() builds on it."""

    def test_returns_8_char_hex(self):
        """The signature is eight characters drawn from lowercase hex digits."""
        digest = compute_signature("hello")
        assert len(digest) == 8
        assert set(digest) <= set("0123456789abcdef")

    def test_deterministic(self):
        """Signing the same text twice gives the same signature."""
        first = compute_signature("hello")
        second = compute_signature("hello")
        assert first == second

    def test_different_content_different_sig(self):
        """Two different inputs do not share a signature."""
        hello_sig = compute_signature("hello")
        world_sig = compute_signature("world")
        assert hello_sig != world_sig

    def test_sign_content_uses_compute_signature(self):
        """sign_content should produce [HIDDEN:{compute_signature(content)}] prefix."""
        text = "test message"
        expected = f"[HIDDEN:{compute_signature(text)}] {text}"
        assert sign_content(text) == expected
class TestAddAssistantMessage:
    """Verify when add_assistant_message() attaches a _hidden_sig field."""

    def setup_method(self):
        """Create a fresh ContextBuilder (constructed with Path("/tmp")) per test."""
        workspace = Path("/tmp")
        self.ctx = ContextBuilder(workspace)

    @staticmethod
    def _single_tool_call() -> list:
        """Return a one-element tool_calls list used by the tests below."""
        return [
            {
                "id": "tc1",
                "type": "function",
                "function": {"name": "test", "arguments": "{}"},
            }
        ]

    def test_intermediate_message_gets_hidden_sig(self):
        """A message carrying tool_calls is stored with the signature of its content."""
        history: list = []
        self.ctx.add_assistant_message(history, "thinking...", self._single_tool_call())
        recorded = history[0]
        assert recorded.get("_hidden_sig") is not None
        assert recorded["_hidden_sig"] == compute_signature("thinking...")

    def test_final_message_no_hidden_sig(self):
        """A message added with tool_calls=None carries no _hidden_sig key."""
        history: list = []
        self.ctx.add_assistant_message(history, "Here is the answer", None)
        recorded = history[0]
        assert "_hidden_sig" not in recorded

    def test_empty_content_signed(self):
        """With content None and tool_calls set, the empty string is what gets signed."""
        history: list = []
        self.ctx.add_assistant_message(history, None, self._single_tool_call())
        recorded = history[0]
        assert recorded["_hidden_sig"] == compute_signature("")
class TestAddToolResult:
    """Verify the _hidden_sig field that add_tool_result() stores."""

    def setup_method(self):
        """Create a fresh ContextBuilder (constructed with Path("/tmp")) per test."""
        workspace = Path("/tmp")
        self.ctx = ContextBuilder(workspace)

    def test_tool_result_gets_hidden_sig(self):
        """A string tool result is stored with the signature of that string."""
        history: list = []
        self.ctx.add_tool_result(history, "tc1", "read_file", "file contents here")
        recorded = history[0]
        assert recorded["_hidden_sig"] == compute_signature("file contents here")

    def test_tool_result_non_string_content(self):
        """A list-valued (multipart) result is expected to be signed as the empty string."""
        history: list = []
        # Multipart content (e.g. image) is a list, not a string
        multipart = [{"type": "text", "text": "ok"}]
        self.ctx.add_tool_result(history, "tc1", "screenshot", multipart)
        recorded = history[0]
        assert recorded["_hidden_sig"] == compute_signature("")
class TestGetHistoryPrefix:
    """Verify that Session.get_history() renders the [HIDDEN:sig] prefix."""

    def test_hidden_sig_applied_at_read_time(self):
        """A stored _hidden_sig becomes a content prefix and the key itself is dropped."""
        session = Session(key="test")
        signature = compute_signature("thinking...")
        session.messages = [
            {
                "role": "assistant",
                "content": "thinking...",
                "tool_calls": [{}],
                "_hidden_sig": signature,
            },
        ]

        rendered = session.get_history()
        first = rendered[0]

        assert first["content"] == f"[HIDDEN:{signature}] thinking..."
        assert "_hidden_sig" not in first

    def test_no_prefix_without_hidden_sig(self):
        """A message stored without _hidden_sig comes back with its content unchanged."""
        session = Session(key="test")
        session.messages = [
            {"role": "assistant", "content": "Here is the answer"},
        ]

        rendered = session.get_history()

        assert rendered[0]["content"] == "Here is the answer"

    def test_tool_result_gets_prefix(self):
        """A tool-role message with _hidden_sig is prefixed as well."""
        session = Session(key="test")
        signature = compute_signature("file contents")
        session.messages = [
            {
                "role": "tool",
                "tool_call_id": "tc1",
                "name": "read",
                "content": "file contents",
                "_hidden_sig": signature,
            },
        ]

        rendered = session.get_history()

        assert rendered[0]["content"] == f"[HIDDEN:{signature}] file contents"

    def test_roundtrip_jsonl(self, tmp_path):
        """Write to session JSONL, reload, verify get_history() produces correct prefix."""
        from nanobot.session.manager import SessionManager

        session_key = "test:roundtrip"
        workspace = tmp_path / "workspace"
        workspace.mkdir()
        manager = SessionManager(workspace)
        session = manager.get_or_create(session_key)
        signature = compute_signature("intermediate")

        hidden_message = {
            "role": "assistant",
            "content": "intermediate",
            "tool_calls": [
                {
                    "id": "tc1",
                    "type": "function",
                    "function": {"name": "x", "arguments": "{}"},
                }
            ],
            "_hidden_sig": signature,
        }
        visible_message = {
            "role": "assistant",
            "content": "final answer",
        }
        session.add_raw_message(hidden_message)
        session.add_raw_message(visible_message)
        manager.save(session)

        # Reload from disk
        manager.invalidate(session_key)
        reloaded = manager.get_or_create(session_key)
        rendered = reloaded.get_history()

        assert rendered[0]["content"] == f"[HIDDEN:{signature}] intermediate"
        assert rendered[1]["content"] == "final answer"

    def test_idempotent_across_calls(self):
        """Same prefix produced every call (cache stability)."""
        session = Session(key="test")
        signature = compute_signature("msg")
        session.messages = [
            {"role": "assistant", "content": "msg", "_hidden_sig": signature},
        ]

        first_read = session.get_history()
        second_read = session.get_history()

        assert first_read[0]["content"] == second_read[0]["content"]