From 54255c89c45cafda1a6aea017f12723dc3909abe Mon Sep 17 00:00:00 2001 From: code-server Date: Sat, 28 Feb 2026 03:47:08 +0000 Subject: [PATCH] Replace message chunking with upstream's proven implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch from sentence-boundary splitting to upstream's simpler approach - Uses max_len=4000 (safer buffer vs 4096 limit) - Split priority: line breaks → spaces → hard cut - Battle-tested implementation from HKUDS/nanobot upstream - Simpler, more maintainable code - Works better for both prose and code/logs Co-Authored-By: Claude Sonnet 4.5 --- nanobot/channels/telegram.py | 81 +++++++++++---------------------- tests/test_telegram_chunking.py | 8 ++-- 2 files changed, 31 insertions(+), 58 deletions(-) diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py index b5e6dc2..99e012b 100644 --- a/nanobot/channels/telegram.py +++ b/nanobot/channels/telegram.py @@ -218,70 +218,43 @@ class TelegramChannel(BaseChannel): except Exception as e2: logger.error(f"Error sending Telegram message: {e2}") + @staticmethod + def _split_message(content: str, max_len: int = 4000) -> list[str]: + """Split content into chunks within max_len, preferring line breaks. + + Adapted from upstream HKUDS/nanobot; a break found at index 0 is ignored so a long message never yields an empty chunk. + Default limit is 4000 chars (headroom under Telegram's 4096); split priority: newline → space → hard cut. + """ + if len(content) <= max_len: + return [content] + chunks: list[str] = [] + while content: + if len(content) <= max_len: + chunks.append(content) + break + cut = content[:max_len] + pos = cut.rfind('\n') + if pos <= 0: # 0 would emit an empty chunk; -1 means not found + pos = cut.rfind(' ') + if pos <= 0: + pos = max_len + chunks.append(content[:pos]) + content = content[pos:].lstrip() + return chunks + async def _send_text_chunks( self, chat_id: int, text: str, parse_mode: str | None = "HTML" ) -> None: - """Split and send long messages at sentence boundaries. + """Split and send long messages.
Telegram has a 4096 character limit per message. - Per design doc: split at sentence boundaries, send as multiple messages. + Uses upstream's proven implementation - splits at line breaks, then spaces. """ - MAX_LENGTH = 4096 + chunks = self._split_message(text) - if len(text) <= MAX_LENGTH: - # Single message - await self._app.bot.send_message( - chat_id=chat_id, - text=text, - parse_mode=parse_mode - ) - return - - # Split at sentence boundaries - import re - # Split on sentence endings: . ! ? followed by space/newline/end - sentences = re.split(r'([.!?]+(?:\s+|$))', text) - - # Rejoin sentence with its punctuation - parts = [] - for i in range(0, len(sentences) - 1, 2): - parts.append(sentences[i] + (sentences[i+1] if i+1 < len(sentences) else '')) - if len(sentences) % 2 == 1: # Last part without punctuation - parts.append(sentences[-1]) - - # Group into chunks under MAX_LENGTH - chunks = [] - current_chunk = "" - - for part in parts: - # If single part exceeds limit, force split it - if len(part) > MAX_LENGTH: - if current_chunk: - chunks.append(current_chunk) - current_chunk = "" - # Hard split at MAX_LENGTH - for i in range(0, len(part), MAX_LENGTH): - chunks.append(part[i:i + MAX_LENGTH]) - continue - - # Try adding part to current chunk - test_chunk = current_chunk + part - if len(test_chunk) > MAX_LENGTH: - # Save current chunk, start new one - if current_chunk: - chunks.append(current_chunk) - current_chunk = part - else: - current_chunk = test_chunk - - # Add final chunk - if current_chunk: - chunks.append(current_chunk) - - # Send all chunks as separate messages for chunk in chunks: await self._app.bot.send_message( chat_id=chat_id, diff --git a/tests/test_telegram_chunking.py b/tests/test_telegram_chunking.py index ce27825..82e05c4 100644 --- a/tests/test_telegram_chunking.py +++ b/tests/test_telegram_chunking.py @@ -92,8 +92,8 @@ async def test_long_message_splits_at_sentences(): @pytest.mark.asyncio -async def test_message_at_exactly_4096_chars(): - 
"""Message at exactly 4096 chars should not chunk.""" +async def test_message_at_exactly_4000_chars(): + """Message at exactly 4000 chars should not chunk (safer limit).""" config = MagicMock() config.token = "test-token" bus = MagicMock() @@ -112,8 +112,8 @@ async def test_message_at_exactly_4096_chars(): channel._app = MockApp() - # Exactly 4096 chars - content = "A" * 4096 + # Exactly 4000 chars (upstream uses 4000 as safer limit vs 4096) + content = "A" * 4000 msg = OutboundMessage( channel="telegram",