Merge branch 'main' into pr-30

2026-02-04 03:24:31 +00:00
parent 00841309c1 ed809637bc
commit e508f73f54
14 changed files with 328 additions and 40 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,13 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.egg-info
+dist/
+build/
+.git
+.env
+.assets
+node_modules/
+bridge/dist/
+workspace/
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,48 @@
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+
+# Fail a RUN step when any command in a pipeline fails; the default /bin/sh -c
+# only reports the last command's status, which would hide a failed key download.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install Node.js 20 for the WhatsApp bridge from the NodeSource apt repository.
+# gnupg is only needed to dearmor the repository key, so it is purged again in
+# the same layer; the apt lists are removed to keep the layer small.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl ca-certificates gnupg git && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends nodejs && \
+    apt-get purge -y gnupg && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies first (cached layer): only the project metadata and
+# an empty package stub are present, so this layer is reused until those files change.
+COPY pyproject.toml README.md LICENSE ./
+RUN mkdir -p nanobot bridge && touch nanobot/__init__.py && \
+    uv pip install --system --no-cache . && \
+    rm -rf nanobot bridge
+
+# Copy the full source and install
+COPY nanobot/ nanobot/
+COPY bridge/ bridge/
+RUN uv pip install --system --no-cache .
+
+# Build the WhatsApp bridge
+WORKDIR /app/bridge
+RUN npm install && npm run build
+WORKDIR /app
+
+# Create config directory (the path users bind-mount their host config onto)
+RUN mkdir -p /root/.nanobot
+
+# Gateway default port
+EXPOSE 18790
+
+# Default to `nanobot status`; arguments to `docker run` replace CMD only.
+ENTRYPOINT ["nanobot"]
+CMD ["status"]
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@
    <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
    <a href="./COMMUNICATION.md"><img src="https://img.shields.io/badge/Feishu-Group-E9DBFC?style=flat&logo=feishu&logoColor=white" alt="Feishu"></a>
    <a href="./COMMUNICATION.md"><img src="https://img.shields.io/badge/WeChat-Group-C5EAB4?style=flat&logo=wechat&logoColor=white" alt="WeChat"></a>
+    <a href="https://discord.gg/MnCvHqpUGB"><img src="https://img.shields.io/badge/Discord-Community-5865F2?style=flat&logo=discord&logoColor=white" alt="Discord"></a>
  </p>
 </div>

@@ -17,7 +18,7 @@

 ## 📢 News

- **2025-02-01** 🎉 nanobot launched! Welcome to try 🐈 nanobot!
+- **2026-02-01** 🎉 nanobot launched! Welcome to try 🐈 nanobot!

 ## Key Features of nanobot:

@@ -60,13 +61,7 @@

 ## 📦 Install

-**Install from PyPi**
-
-```bash
-pip install nanobot-ai
-```
-
-**Install from source** (recommended for development)
+**Install from source** (latest features, recommended for development)

 ```bash
 git clone https://github.com/HKUDS/nanobot.git
@@ -74,12 +69,16 @@ cd nanobot
 pip install -e .
 ```

-**Install with uv**
+**Install with [uv](https://github.com/astral-sh/uv)** (stable, fast)

 ```bash
-uv venv
-source .venv/bin/activate
-uv pip install nanobot-ai
+uv tool install nanobot-ai
+```
+
+**Install from PyPI** (stable)
+
+```bash
+pip install nanobot-ai
 ```

 ## 🚀 Quick Start
@@ -241,6 +240,22 @@ nanobot gateway

 ## ⚙️ Configuration

+Config file: `~/.nanobot/config.json`
+
+### Providers
+
+> [!NOTE]
+> Groq provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed.
+
+| Provider | Purpose | Get API Key |
+|----------|---------|-------------|
+| `openrouter` | LLM (recommended, access to all models) | [openrouter.ai](https://openrouter.ai) |
+| `anthropic` | LLM (Claude direct) | [console.anthropic.com](https://console.anthropic.com) |
+| `openai` | LLM (GPT direct) | [platform.openai.com](https://platform.openai.com) |
+| `groq` | LLM + **Voice transcription** (Whisper) | [console.groq.com](https://console.groq.com) |
+| `gemini` | LLM (Gemini direct) | [aistudio.google.com](https://aistudio.google.com) |
+
+
 <details>
 <summary><b>Full config example</b></summary>

@@ -254,6 +269,9 @@ nanobot gateway
  "providers": {
    "openrouter": {
      "apiKey": "sk-or-v1-xxx"
+    },
+    "groq": {
+      "apiKey": "gsk_xxx"
    }
  },
  "channels": {
@@ -307,6 +325,31 @@ nanobot cron remove <job_id>

 </details>

+## 🐳 Docker
+
+> [!TIP]
+> The `-v ~/.nanobot:/root/.nanobot` flag mounts your local config directory into the container, so your config and workspace persist across container restarts.
+
+Build and run nanobot in a container:
+
+```bash
+# Build the image
+docker build -t nanobot .
+
+# Initialize config (first time only)
+docker run -v ~/.nanobot:/root/.nanobot --rm nanobot onboard
+
+# Edit config on host to add API keys
+vim ~/.nanobot/config.json
+
+# Run gateway (connects to Telegram/WhatsApp)
+docker run -v ~/.nanobot:/root/.nanobot -p 18790:18790 nanobot gateway
+
+# Or run a single command
+docker run -v ~/.nanobot:/root/.nanobot --rm nanobot agent -m "Hello!"
+docker run -v ~/.nanobot:/root/.nanobot --rm nanobot status
+```
+
 ## 📁 Project Structure

 ```
@@ -335,6 +378,7 @@ PRs welcome! The codebase is intentionally small and readable. 🤗

 **Roadmap** — Pick an item and [open a PR](https://github.com/HKUDS/nanobot/pulls)!

+- [x] **Voice Transcription** — Support for Groq Whisper (Issue #13)
 - [ ] **Multi-modal** — See and hear (images, voice, video)
 - [ ] **Long-term memory** — Never forget important context
 - [ ] **Better reasoning** — Multi-step planning and reflection
@@ -347,7 +391,6 @@ PRs welcome! The codebase is intentionally small and readable. 🤗
  <img src="https://contrib.rocks/image?repo=HKUDS/nanobot" />
 </a>

---

 ## ⭐ Star History

@@ -365,3 +408,8 @@ PRs welcome! The codebase is intentionally small and readable. 🤗
  <em> Thanks for visiting ✨ nanobot!</em><br><br>
  <img src="https://visitor-badge.laobi.icu/badge?page_id=HKUDS.nanobot&style=for-the-badge&color=00d4ff" alt="Views">
 </p>
+
+
+<p align="center">
+  <sub>nanobot is for educational, research, and technical exchange purposes only</sub>
+</p>
--- a/bridge/src/whatsapp.ts
+++ b/bridge/src/whatsapp.ts
@@ -160,6 +160,11 @@ export class WhatsAppClient {
      return `[Document] ${message.documentMessage.caption}`;
    }

+    // Voice/Audio message
+    if (message.audioMessage) {
+      return `[Voice Message]`;
+    }
+
    return null;
  }

--- a/nanobot/agent/tools/web.py
+++ b/nanobot/agent/tools/web.py
@@ -5,6 +5,7 @@ import json
 import os
 import re
 from typing import Any
+from urllib.parse import urlparse

 import httpx

@@ -12,6 +13,7 @@ from nanobot.agent.tools.base import Tool

 # Shared constants
 USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36"
+MAX_REDIRECTS = 5  # Limit redirects to prevent DoS attacks


 def _strip_tags(text: str) -> str:
@@ -28,6 +30,19 @@ def _normalize(text: str) -> str:
    return re.sub(r'\n{3,}', '\n\n', text).strip()


+def _validate_url(url: str) -> tuple[bool, str]:
+    """Return (ok, error): url must be http(s) and name a host ("http://:80" is rejected)."""
+    try:
+        parsed = urlparse(url)
+        if parsed.scheme not in ('http', 'https'):
+            return False, f"Only http/https allowed, got '{parsed.scheme or 'none'}'"
+        if not parsed.hostname:
+            return False, "Missing domain"
+        return True, ""
+    except Exception as e:
+        return False, str(e)
+
+
 class WebSearchTool(Tool):
    """Search the web using Brave Search API."""
    
@@ -95,12 +110,21 @@ class WebFetchTool(Tool):
    
    async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> str:
        from readability import Document
-        
+
        max_chars = maxChars or self.max_chars
-        
+
+        # Validate URL before fetching
+        is_valid, error_msg = _validate_url(url)
+        if not is_valid:
+            return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url})
+
        try:
-            async with httpx.AsyncClient() as client:
-                r = await client.get(url, headers={"User-Agent": USER_AGENT}, follow_redirects=True, timeout=30.0)
+            async with httpx.AsyncClient(
+                follow_redirects=True,
+                max_redirects=MAX_REDIRECTS,
+                timeout=30.0
+            ) as client:
+                r = await client.get(url, headers={"User-Agent": USER_AGENT})
                r.raise_for_status()
            
            ctype = r.headers.get("content-type", "")
--- a/nanobot/channels/manager.py
+++ b/nanobot/channels/manager.py
@@ -37,7 +37,9 @@ class ChannelManager:
            try:
                from nanobot.channels.telegram import TelegramChannel
                self.channels["telegram"] = TelegramChannel(
-                    self.config.channels.telegram, self.bus
+                    self.config.channels.telegram,
+                    self.bus,
+                    groq_api_key=self.config.providers.groq.api_key,
                )
                logger.info("Telegram channel enabled")
            except ImportError as e:
--- a/nanobot/channels/telegram.py
+++ b/nanobot/channels/telegram.py
@@ -85,9 +85,10 @@ class TelegramChannel(BaseChannel):
    
    name = "telegram"
    
-    def __init__(self, config: TelegramConfig, bus: MessageBus):
+    def __init__(self, config: TelegramConfig, bus: MessageBus, groq_api_key: str = ""):
        super().__init__(config, bus)
        self.config: TelegramConfig = config
+        self.groq_api_key = groq_api_key
        self._app: Application | None = None
        self._chat_ids: dict[str, int] = {}  # Map sender_id to chat_id for replies
    
@@ -249,7 +250,20 @@ class TelegramChannel(BaseChannel):
                await file.download_to_drive(str(file_path))
                
                media_paths.append(str(file_path))
-                content_parts.append(f"[{media_type}: {file_path}]")
+                
+                # Handle voice transcription
+                if media_type == "voice" or media_type == "audio":
+                    from nanobot.providers.transcription import GroqTranscriptionProvider
+                    transcriber = GroqTranscriptionProvider(api_key=self.groq_api_key)
+                    transcription = await transcriber.transcribe(file_path)
+                    if transcription:
+                        logger.info(f"Transcribed {media_type}: {transcription[:50]}...")
+                        content_parts.append(f"[transcription: {transcription}]")
+                    else:
+                        content_parts.append(f"[{media_type}: {file_path}]")
+                else:
+                    content_parts.append(f"[{media_type}: {file_path}]")
+                    
                logger.debug(f"Downloaded {media_type} to {file_path}")
            except Exception as e:
                logger.error(f"Failed to download media: {e}")
--- a/nanobot/channels/whatsapp.py
+++ b/nanobot/channels/whatsapp.py
@@ -107,6 +107,11 @@ class WhatsAppChannel(BaseChannel):
            # Extract just the phone number as chat_id
            chat_id = sender.split("@")[0] if "@" in sender else sender
            
+            # Handle voice transcription if it's a voice message
+            if content == "[Voice Message]":
+                logger.info(f"Voice message received from {chat_id}, but direct download from bridge is not yet supported.")
+                content = "[Voice Message: Transcription not available for WhatsApp yet]"
+            
            await self._handle_message(
                sender_id=chat_id,
                chat_id=sender,  # Use full JID for replies
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -178,11 +178,13 @@ def gateway(
    # Create components
    bus = MessageBus()
    
-    # Create provider (supports OpenRouter, Anthropic, OpenAI)
+    # Create provider (supports OpenRouter, Anthropic, OpenAI, Bedrock)
    api_key = config.get_api_key()
    api_base = config.get_api_base()
-    
-    if not api_key:
+    model = config.agents.defaults.model
+    is_bedrock = model.startswith("bedrock/")
+
+    if not api_key and not is_bedrock:
        console.print("[red]Error: No API key configured.[/red]")
        console.print("Set one in ~/.nanobot/config.json under providers.openrouter.apiKey")
        raise typer.Exit(1)
@@ -289,11 +291,13 @@ def agent(
    
    api_key = config.get_api_key()
    api_base = config.get_api_base()
-    
-    if not api_key:
+    model = config.agents.defaults.model
+    is_bedrock = model.startswith("bedrock/")
+
+    if not api_key and not is_bedrock:
        console.print("[red]Error: No API key configured.[/red]")
        raise typer.Exit(1)
-    
+
    bus = MessageBus()
    provider = LiteLLMProvider(
        api_key=api_key,
@@ -348,21 +352,31 @@ app.add_typer(channels_app, name="channels")
 def channels_status():
    """Show channel status."""
    from nanobot.config.loader import load_config
-    
+
    config = load_config()
-    
+
    table = Table(title="Channel Status")
    table.add_column("Channel", style="cyan")
    table.add_column("Enabled", style="green")
-    table.add_column("Bridge URL", style="yellow")
-    
+    table.add_column("Configuration", style="yellow")
+
+    # WhatsApp
    wa = config.channels.whatsapp
    table.add_row(
        "WhatsApp",
        "✓" if wa.enabled else "✗",
        wa.bridge_url
    )
-    
+
+    # Telegram
+    tg = config.channels.telegram
+    tg_config = f"token: {tg.token[:10]}..." if tg.token else "[dim]not configured[/dim]"
+    table.add_row(
+        "Telegram",
+        "✓" if tg.enabled else "✗",
+        tg_config
+    )
+
    console.print(table)


@@ -608,18 +622,17 @@ def cron_run(
 def status():
    """Show nanobot status."""
    from nanobot.config.loader import load_config, get_config_path
-    from nanobot.utils.helpers import get_workspace_path
-    
+
    config_path = get_config_path()
-    workspace = get_workspace_path()
-    
+    config = load_config()
+    workspace = config.workspace_path
+
    console.print(f"{__logo__} nanobot Status\n")
-    
+
    console.print(f"Config: {config_path} {'[green]✓[/green]' if config_path.exists() else '[red]✗[/red]'}")
    console.print(f"Workspace: {workspace} {'[green]✓[/green]' if workspace.exists() else '[red]✗[/red]'}")
-    
+
    if config_path.exists():
-        config = load_config()
        console.print(f"Model: {config.agents.defaults.model}")
        
        # Check API keys
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@@ -50,6 +50,7 @@ class ProvidersConfig(BaseModel):
    anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
    openai: ProviderConfig = Field(default_factory=ProviderConfig)
    openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
+    groq: ProviderConfig = Field(default_factory=ProviderConfig)
    zhipu: ProviderConfig = Field(default_factory=ProviderConfig)
    vllm: ProviderConfig = Field(default_factory=ProviderConfig)
    gemini: ProviderConfig = Field(default_factory=ProviderConfig)
@@ -91,13 +92,14 @@ class Config(BaseSettings):
        return Path(self.agents.defaults.workspace).expanduser()
    
    def get_api_key(self) -> str | None:
-        """Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > vLLM."""
+        """Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > Groq > vLLM."""
        return (
            self.providers.openrouter.api_key or
            self.providers.anthropic.api_key or
            self.providers.openai.api_key or
            self.providers.gemini.api_key or
            self.providers.zhipu.api_key or
+            self.providers.groq.api_key or
            self.providers.vllm.api_key or
            None
        )
--- a/nanobot/heartbeat/service.py
+++ b/nanobot/heartbeat/service.py
@@ -115,7 +115,7 @@ class HeartbeatService:
                response = await self.on_heartbeat(HEARTBEAT_PROMPT)
                
                # Check if agent said "nothing to do"
-                if HEARTBEAT_OK_TOKEN in response.upper().replace("_", ""):
+                if HEARTBEAT_OK_TOKEN.replace("_", "") in response.upper().replace("_", ""):
                    logger.info("Heartbeat: OK (no action needed)")
                else:
                    logger.info(f"Heartbeat: completed task")
--- a/nanobot/providers/litellm_provider.py
+++ b/nanobot/providers/litellm_provider.py
@@ -51,6 +51,8 @@ class LiteLLMProvider(LLMProvider):
                os.environ.setdefault("GEMINI_API_KEY", api_key)
            elif "zhipu" in default_model or "glm" in default_model or "zai" in default_model:
                os.environ.setdefault("ZHIPUAI_API_KEY", api_key)
+            elif "groq" in default_model:
+                os.environ.setdefault("GROQ_API_KEY", api_key)
        
        if api_base:
            litellm.api_base = api_base
--- a/nanobot/providers/transcription.py
+++ b/nanobot/providers/transcription.py
@@ -0,0 +1,65 @@
+"""Voice transcription provider using Groq."""
+
+import os
+from pathlib import Path
+from typing import Any
+
+import httpx
+from loguru import logger
+
+
+class GroqTranscriptionProvider:
+    """
+    Speech-to-text client for Groq's Whisper transcription endpoint.
+
+    A missing key, a missing file and request errors are logged and yield "".
+    """
+
+    def __init__(self, api_key: str | None = None):
+        # An explicit key wins; otherwise fall back to the GROQ_API_KEY env var.
+        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
+        self.api_url = "https://api.groq.com/openai/v1/audio/transcriptions"
+
+    async def transcribe(self, file_path: str | Path) -> str:
+        """
+        Upload an audio file to Groq and return the recognised text.
+
+        Args:
+            file_path: Location of the audio file on disk.
+
+        Returns:
+            The "text" field of the JSON response, or "" when no API key is
+            set, the file does not exist, or the request fails.
+        """
+        if not self.api_key:
+            logger.warning("Groq API key not configured for transcription")
+            return ""
+
+        audio = Path(file_path)
+        if not audio.exists():
+            logger.error(f"Audio file not found: {file_path}")
+            return ""
+
+        auth_headers = {"Authorization": f"Bearer {self.api_key}"}
+        try:
+            async with httpx.AsyncClient() as client:
+                with open(audio, "rb") as stream:
+                    # Multipart body: the audio itself plus the model name,
+                    # which is passed as a (None, value) entry without a filename.
+                    form = {
+                        "file": (audio.name, stream),
+                        "model": (None, "whisper-large-v3"),
+                    }
+                    reply = await client.post(
+                        self.api_url,
+                        headers=auth_headers,
+                        files=form,
+                        timeout=60.0,
+                    )
+                    # Error statuses raise here and are handled below.
+                    reply.raise_for_status()
+                    return reply.json().get("text", "")
+        except Exception as e:
+            # Best effort: transcription problems must not propagate to the caller.
+            logger.error(f"Groq transcription error: {e}")
+            return ""
--- a/test_docker.sh
+++ b/test_docker.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# Smoke test for the nanobot Docker image: build it, run `nanobot onboard`,
+# then verify that `nanobot status` prints the expected sections.
+set -euo pipefail
+
+IMAGE_NAME="nanobot-test"
+CONTAINER_NAME="nanobot-test-run"
+ONBOARDED_IMAGE="nanobot-test-onboarded"
+
+# Remove the container and images created below. Installed as an EXIT trap so
+# it also runs when a step aborts under `set -e` or validation exits non-zero;
+# the script's exit status is left untouched.
+cleanup() {
+    echo ""
+    echo "=== Cleanup ==="
+    docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+    docker rmi -f "$ONBOARDED_IMAGE" 2>/dev/null || true
+    docker rmi -f "$IMAGE_NAME" 2>/dev/null || true
+    echo "Done."
+}
+trap cleanup EXIT
+
+# Print a PASS/FAIL line for one literal string expected in the status output
+# and clear PASS on a miss.
+check() {
+    # A here-string avoids `echo ... | grep -q`, where grep exiting at the first
+    # match can SIGPIPE the writer and fail the pipeline under `pipefail`.
+    if grep -qF -- "$1" <<<"$STATUS_OUTPUT"; then
+        echo "  PASS: found '$1'"
+    else
+        echo "  FAIL: missing '$1'"
+        PASS=false
+    fi
+}
+
+echo "=== Building Docker image ==="
+docker build -t "$IMAGE_NAME" .
+
+echo ""
+echo "=== Running 'nanobot onboard' ==="
+# Clear a container left behind by an interrupted run so --name cannot collide.
+docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
+docker run --name "$CONTAINER_NAME" "$IMAGE_NAME" onboard
+
+echo ""
+echo "=== Running 'nanobot status' ==="
+# Snapshot the stopped onboard container so `status` runs against its filesystem.
+docker commit "$CONTAINER_NAME" "$ONBOARDED_IMAGE" > /dev/null
+# A non-zero exit from `status` is tolerated; its output is validated below.
+STATUS_OUTPUT=$(docker run --rm "$ONBOARDED_IMAGE" status 2>&1) || true
+
+echo "$STATUS_OUTPUT"
+
+echo ""
+echo "=== Validating output ==="
+PASS=true
+
+check "nanobot Status"
+check "Config:"
+check "Workspace:"
+check "Model:"
+check "OpenRouter API:"
+check "Anthropic API:"
+check "OpenAI API:"
+
+echo ""
+if $PASS; then
+    echo "=== All checks passed ==="
+else
+    echo "=== Some checks FAILED ==="
+    exit 1
+fi