54255c89c4
- Switch from sentence-boundary splitting to upstream's simpler approach - Uses max_len=4000 (safer buffer vs 4096 limit) - Split priority: line breaks → spaces → hard cut - Battle-tested implementation from HKUDS/nanobot upstream - Simpler, more maintainable code - Works better for both prose and code/logs Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
171 lines
4.3 KiB
Python
171 lines
4.3 KiB
Python
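"""Tests for Telegram message chunking.

Outgoing messages longer than the chunk limit are split before sending. The
limit is 4000 chars (a safety buffer under Telegram's 4096-char cap), and split
points are chosen at line breaks first, then spaces, then a hard cut.
"""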
|
|
|
|
import pytest
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
from nanobot.bus.events import OutboundMessage
|
|
from nanobot.channels.telegram import TelegramChannel
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_short_message_not_chunked():
    """A short message reaches the bot as one send with its text unchanged."""
    delivered = []

    class RecordingBot:
        """Bot stand-in that records each send instead of contacting Telegram."""

        async def send_message(self, chat_id, text, parse_mode=None):
            delivered.append({"chat_id": chat_id, "text": text})

    class FakeApp:
        """Application stand-in exposing only the ``bot`` attribute."""

        bot = RecordingBot()

    channel = TelegramChannel(MagicMock(token="test-token"), MagicMock())
    # Swap in the recording app so ``send`` never touches a real bot.
    channel._app = FakeApp()

    outbound = OutboundMessage(
        channel="telegram",
        chat_id="123",
        content="Short message.",
    )
    await channel.send(outbound)

    # One send, and only one, carrying exactly the original text.
    assert [entry["text"] for entry in delivered] == ["Short message."]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_long_message_splits_at_sentences():
    """A 5000-char message goes out as several chunks that reassemble to the input.

    Checks three things about what the bot receives: more than one send, no
    send longer than 4096 chars, and the concatenated sends matching the
    original content once spaces are ignored.
    """
    outbox = []

    class RecordingBot:
        """Bot stand-in that records each send instead of contacting Telegram."""

        async def send_message(self, chat_id, text, parse_mode=None):
            outbox.append({"chat_id": chat_id, "text": text})

    class FakeApp:
        """Application stand-in exposing only the ``bot`` attribute."""

        bot = RecordingBot()

    channel = TelegramChannel(MagicMock(token="test-token"), MagicMock())
    channel._app = FakeApp()

    # 25 sentences of 200 chars each (195 filler chars + "end. ") -> 5000 chars,
    # which is past the 4096-char ceiling asserted below.
    unit = "A" * 195 + "end. "
    long_content = unit * 25

    await channel.send(
        OutboundMessage(channel="telegram", chat_id="123", content=long_content)
    )

    chunks = [entry["text"] for entry in outbox]

    # The payload must have been broken up ...
    assert len(chunks) > 1
    # ... with no single chunk longer than 4096 chars ...
    assert all(len(chunk) <= 4096 for chunk in chunks)
    # ... and nothing lost; spaces are dropped from both sides of the comparison
    # so whitespace trimmed where chunks meet does not count as a difference.
    assert "".join(chunks).replace(" ", "") == long_content.replace(" ", "")
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_message_at_exactly_4000_chars():
    """A message of exactly 4000 chars is sent as a single message."""
    recorded = []

    class RecordingBot:
        """Bot stand-in that records each send instead of contacting Telegram."""

        async def send_message(self, chat_id, text, parse_mode=None):
            recorded.append({"chat_id": chat_id, "text": text})

    class FakeApp:
        """Application stand-in exposing only the ``bot`` attribute."""

        bot = RecordingBot()

    channel = TelegramChannel(MagicMock(token="test-token"), MagicMock())
    channel._app = FakeApp()

    # Boundary case: 4000 chars is the length this test expects to still fit in
    # one send (the safer limit used instead of Telegram's 4096).
    payload = "A" * 4000

    await channel.send(
        OutboundMessage(channel="telegram", chat_id="123", content=payload)
    )

    # No chunking at the boundary: a single send only.
    assert len(recorded) == 1
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_message_preserves_sentence_boundaries():
    """Every chunk of a split message ends on sentence punctuation.

    The content is built from 16-char sentences so that a space falls exactly
    at the 4000-char mark; a splitter that cuts at whitespace near the limit
    therefore leaves each chunk ending in ".".
    """
    config = MagicMock()
    config.token = "test-token"
    bus = MagicMock()

    channel = TelegramChannel(config, bus)

    # Record sends instead of contacting Telegram.
    sent_messages = []

    class MockBot:
        async def send_message(self, chat_id, text, parse_mode=None):
            sent_messages.append({"chat_id": chat_id, "text": text})

    class MockApp:
        bot = MockBot()

    channel._app = MockApp()

    # part1 is exactly 4000 chars (16 chars * 250); part2 adds 17 more, so the
    # total of 4017 chars is over the 4000-char chunk limit and must be split.
    part1 = "First sentence. " * 250
    part2 = "Second sentence. "
    content = part1 + part2

    msg = OutboundMessage(
        channel="telegram",
        chat_id="123",
        content=content,
    )

    await channel.send(msg)

    # Guard against a vacuous pass: without this, the loop below succeeds even
    # when nothing was sent or the message was never split.
    assert len(sent_messages) > 1, "expected the over-limit message to be chunked"

    # Verify chunks don't break mid-sentence.
    for sent in sent_messages:
        text = sent["text"].strip()
        # Empty chunks are skipped; every non-empty chunk must end with
        # sentence punctuation.
        if text:
            assert text[-1] in ".!?"
|