Replace message chunking with upstream's proven implementation
- Switch from sentence-boundary splitting to upstream's simpler approach
- Uses max_len=4000 (a safety buffer below Telegram's 4096-character limit)
- Split priority: line breaks → spaces → hard cut
- Battle-tested implementation from HKUDS/nanobot upstream
- Simpler, more maintainable code
- Works better for both prose and code/logs

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -218,70 +218,43 @@ class TelegramChannel(BaseChannel):
|
||||
except Exception as e2:
|
||||
logger.error(f"Error sending Telegram message: {e2}")
|
||||
|
||||
@staticmethod
def _split_message(content: str, max_len: int = 4000) -> list[str]:
    """Split content into chunks of at most max_len characters.

    Each split point is chosen inside the first ``max_len`` characters of
    the remaining text, in this order of preference:

    1. the last line break,
    2. the last space,
    3. a hard cut at exactly ``max_len`` characters.

    The default of 4000 leaves a margin below Telegram's 4096-character
    per-message limit. Whitespace at the start of each remainder is
    stripped, so the separator the text was split on is not carried over
    into the next chunk.

    Args:
        content: Text to split.
        max_len: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The chunks in order. Content that already fits is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # A non-positive limit can never make progress on non-empty text.
        raise ValueError("max_len must be a positive integer")
    if len(content) <= max_len:
        return [content]

    chunks: list[str] = []
    while content:
        if len(content) <= max_len:
            chunks.append(content)
            break
        window = content[:max_len]
        pos = window.rfind('\n')
        # A break at index 0 would produce an empty chunk, so it is
        # treated the same as "not found" (-1).
        if pos <= 0:
            pos = window.rfind(' ')
        if pos <= 0:
            pos = max_len
        chunks.append(content[:pos])
        content = content[pos:].lstrip()
    return chunks
|
||||
|
||||
async def _send_text_chunks(
|
||||
self,
|
||||
chat_id: int,
|
||||
text: str,
|
||||
parse_mode: str | None = "HTML"
|
||||
) -> None:
|
||||
"""Split and send long messages at sentence boundaries.
|
||||
"""Split and send long messages.
|
||||
|
||||
Telegram has a 4096 character limit per message.
|
||||
Per design doc: split at sentence boundaries, send as multiple messages.
|
||||
Uses upstream's proven implementation - splits at line breaks, then spaces.
|
||||
"""
|
||||
MAX_LENGTH = 4096
|
||||
chunks = self._split_message(text)
|
||||
|
||||
if len(text) <= MAX_LENGTH:
|
||||
# Single message
|
||||
await self._app.bot.send_message(
|
||||
chat_id=chat_id,
|
||||
text=text,
|
||||
parse_mode=parse_mode
|
||||
)
|
||||
return
|
||||
|
||||
# Split at sentence boundaries
|
||||
import re
|
||||
# Split on sentence endings: . ! ? followed by space/newline/end
|
||||
sentences = re.split(r'([.!?]+(?:\s+|$))', text)
|
||||
|
||||
# Rejoin sentence with its punctuation
|
||||
parts = []
|
||||
for i in range(0, len(sentences) - 1, 2):
|
||||
parts.append(sentences[i] + (sentences[i+1] if i+1 < len(sentences) else ''))
|
||||
if len(sentences) % 2 == 1: # Last part without punctuation
|
||||
parts.append(sentences[-1])
|
||||
|
||||
# Group into chunks under MAX_LENGTH
|
||||
chunks = []
|
||||
current_chunk = ""
|
||||
|
||||
for part in parts:
|
||||
# If single part exceeds limit, force split it
|
||||
if len(part) > MAX_LENGTH:
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
current_chunk = ""
|
||||
# Hard split at MAX_LENGTH
|
||||
for i in range(0, len(part), MAX_LENGTH):
|
||||
chunks.append(part[i:i + MAX_LENGTH])
|
||||
continue
|
||||
|
||||
# Try adding part to current chunk
|
||||
test_chunk = current_chunk + part
|
||||
if len(test_chunk) > MAX_LENGTH:
|
||||
# Save current chunk, start new one
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
current_chunk = part
|
||||
else:
|
||||
current_chunk = test_chunk
|
||||
|
||||
# Add final chunk
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
|
||||
# Send all chunks as separate messages
|
||||
for chunk in chunks:
|
||||
await self._app.bot.send_message(
|
||||
chat_id=chat_id,
|
||||
|
||||
@@ -92,8 +92,8 @@ async def test_long_message_splits_at_sentences():
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_message_at_exactly_4096_chars():
|
||||
"""Message at exactly 4096 chars should not chunk."""
|
||||
async def test_message_at_exactly_4000_chars():
|
||||
"""Message at exactly 4000 chars should not chunk (safer limit)."""
|
||||
config = MagicMock()
|
||||
config.token = "test-token"
|
||||
bus = MagicMock()
|
||||
@@ -112,8 +112,8 @@ async def test_message_at_exactly_4096_chars():
|
||||
|
||||
channel._app = MockApp()
|
||||
|
||||
# Exactly 4096 chars
|
||||
content = "A" * 4096
|
||||
# Exactly 4000 chars (upstream uses 4000 as safer limit vs 4096)
|
||||
content = "A" * 4000
|
||||
|
||||
msg = OutboundMessage(
|
||||
channel="telegram",
|
||||
|
||||
Reference in New Issue
Block a user