From 54255c89c45cafda1a6aea017f12723dc3909abe Mon Sep 17 00:00:00 2001
From: code-server <code-server@wylab.me>
Date: Sat, 28 Feb 2026 03:47:08 +0000
Subject: [PATCH] Replace message chunking with upstream's proven
 implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Switch from sentence-boundary splitting to upstream's simpler approach
- Use max_len=4000 to leave headroom below Telegram's 4096-character limit
- Split priority: line breaks → spaces → hard cut
- Battle-tested implementation from HKUDS/nanobot upstream
- Simpler, more maintainable code
- Works better for both prose and code/logs

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 nanobot/channels/telegram.py    | 81 +++++++++++----------------------
 tests/test_telegram_chunking.py |  8 ++--
 2 files changed, 31 insertions(+), 58 deletions(-)

diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py
index b5e6dc2..99e012b 100644
--- a/nanobot/channels/telegram.py
+++ b/nanobot/channels/telegram.py
@@ -218,70 +218,43 @@ class TelegramChannel(BaseChannel):
             except Exception as e2:
                 logger.error(f"Error sending Telegram message: {e2}")
 
+    @staticmethod
+    def _split_message(content: str, max_len: int = 4000) -> list[str]:
+        """Split content into chunks of at most max_len characters.
+
+        Adapted from upstream HKUDS/nanobot; 4000 leaves headroom under 4096.
+        Cuts at the last newline, else the last space, else hard at max_len; a
+        separator at index 0 is skipped so no empty chunk is ever produced.
+        """
+        if len(content) <= max_len:
+            return [content]
+        chunks: list[str] = []
+        while len(content) > max_len:
+            cut = content[:max_len]
+            pos = cut.rfind('\n')
+            if pos <= 0:
+                pos = cut.rfind(' ')
+            if pos <= 0:
+                pos = max_len
+            chunks.append(content[:pos])
+            content = content[pos:].lstrip()
+        if content:
+            chunks.append(content)
+        return chunks
+
     async def _send_text_chunks(
         self,
         chat_id: int,
         text: str,
         parse_mode: str | None = "HTML"
     ) -> None:
-        """Split and send long messages at sentence boundaries.
+        """Split and send long messages.
 
         Telegram has a 4096 character limit per message.
-        Per design doc: split at sentence boundaries, send as multiple messages.
+        Uses upstream's proven implementation - splits at line breaks, then spaces.
         """
-        MAX_LENGTH = 4096
+        chunks = self._split_message(text)
 
-        if len(text) <= MAX_LENGTH:
-            # Single message
-            await self._app.bot.send_message(
-                chat_id=chat_id,
-                text=text,
-                parse_mode=parse_mode
-            )
-            return
-
-        # Split at sentence boundaries
-        import re
-        # Split on sentence endings: . ! ? followed by space/newline/end
-        sentences = re.split(r'([.!?]+(?:\s+|$))', text)
-
-        # Rejoin sentence with its punctuation
-        parts = []
-        for i in range(0, len(sentences) - 1, 2):
-            parts.append(sentences[i] + (sentences[i+1] if i+1 < len(sentences) else ''))
-        if len(sentences) % 2 == 1:  # Last part without punctuation
-            parts.append(sentences[-1])
-
-        # Group into chunks under MAX_LENGTH
-        chunks = []
-        current_chunk = ""
-
-        for part in parts:
-            # If single part exceeds limit, force split it
-            if len(part) > MAX_LENGTH:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                    current_chunk = ""
-                # Hard split at MAX_LENGTH
-                for i in range(0, len(part), MAX_LENGTH):
-                    chunks.append(part[i:i + MAX_LENGTH])
-                continue
-
-            # Try adding part to current chunk
-            test_chunk = current_chunk + part
-            if len(test_chunk) > MAX_LENGTH:
-                # Save current chunk, start new one
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = part
-            else:
-                current_chunk = test_chunk
-
-        # Add final chunk
-        if current_chunk:
-            chunks.append(current_chunk)
-
-        # Send all chunks as separate messages
         for chunk in chunks:
             await self._app.bot.send_message(
                 chat_id=chat_id,
diff --git a/tests/test_telegram_chunking.py b/tests/test_telegram_chunking.py
index ce27825..82e05c4 100644
--- a/tests/test_telegram_chunking.py
+++ b/tests/test_telegram_chunking.py
@@ -92,8 +92,8 @@ async def test_long_message_splits_at_sentences():
 
 
 @pytest.mark.asyncio
-async def test_message_at_exactly_4096_chars():
-    """Message at exactly 4096 chars should not chunk."""
+async def test_message_at_exactly_4000_chars():
+    """Message at exactly 4000 chars should not chunk (safer limit)."""
     config = MagicMock()
     config.token = "test-token"
     bus = MagicMock()
@@ -112,8 +112,8 @@ async def test_message_at_exactly_4096_chars():
 
     channel._app = MockApp()
 
-    # Exactly 4096 chars
-    content = "A" * 4096
+    # Exactly 4000 chars (upstream uses 4000 as safer limit vs 4096)
+    content = "A" * 4000
 
     msg = OutboundMessage(
         channel="telegram",