mtmd: add batching API (#24384)

* mtmd: add batching API * wip * first working version (gemma4v) * add arg * nits * wire up support_batch() * fix 0.0 output embd * fix audio * nits * refactor a bit * nits * fix non-batching case * fix comment
ci : unbreak release harder (#24545)
2026-06-13 01:06:45 +02:00 · 2026-06-13 00:10:29 +02:00 · 2026-06-12 23:49:36 +02:00 · 2026-06-12 23:29:49 +03:00 · 2026-06-12 17:59:56 +03:00 · 2026-06-12 16:43:50 +02:00
70 changed files with 9443 additions and 1621 deletions
@@ -59,8 +59,31 @@ jobs:
            echo "should_release=false" >> $GITHUB_OUTPUT
          fi

+  get-version:  # resolves a single UI version string that the build jobs receive via -DHF_UI_VERSION
+    runs-on: ubuntu-slim
+    outputs:
+      ui_version: ${{ steps.version.outputs.ui_version }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - id: version
+        run: |
+          # Resolve UI version: b<BUILD_NUMBER> when cmake/build-info.cmake sets a positive literal, else <short git hash>-<epoch>
+          version=""
+          if [ -f cmake/build-info.cmake ]; then
+            # keep only the first number; `|| true` lets a missing match fall through to the fallback instead of aborting under `bash -e`
+            build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+' | head -n 1 || true)
+            if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
+              version="b${build_number}"
+            fi
+          fi
+          if [ -z "$version" ]; then
+            version="$(git rev-parse --short HEAD)-$(date +%s)"
+          fi
+          echo "ui_version=${version}" >> "$GITHUB_OUTPUT"
+
  macos-cpu:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -116,6 +139,7 @@ jobs:
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -141,7 +165,7 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -201,6 +225,7 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -227,7 +252,7 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    strategy:
@@ -287,6 +312,7 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -312,7 +338,7 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-latest
@@ -379,6 +405,7 @@ jobs:
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -404,7 +431,7 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-24.04
@@ -476,7 +503,8 @@ jobs:
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
+            -DGGML_OPENVINO=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
          cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: ccache-clear
@@ -952,7 +980,7 @@ jobs:
          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-22.04
@@ -1044,6 +1072,7 @@ jobs:
            -DGGML_HIP=ON \
            -DHIP_PLATFORM=amd \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -1072,7 +1101,7 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: windows-2022
@@ -1168,6 +1197,7 @@ jobs:
            -DGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
            -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
@@ -1195,7 +1225,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    runs-on: macos-26

@@ -1224,7 +1254,8 @@ jobs:
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

      - name: xcodebuild for swift package
@@ -1344,10 +1375,12 @@ jobs:
 #          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz

-  ui:
-    needs: [check-release]
+  ui-build:
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    uses: ./.github/workflows/ui-build.yml
+    with:
+      hf_ui_version: ${{ needs.get-version.outputs.ui_version }}

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -1360,6 +1393,7 @@ jobs:
    runs-on: ubuntu-slim

    needs:
+      - get-version
      - windows
      - windows-cpu
      - windows-cuda
@@ -1374,7 +1408,7 @@ jobs:
      - macos-cpu
      - ios-xcode
      #- openEuler-cann
-      - ui
+      - ui-build

    outputs:
      tag_name: ${{ steps.tag.outputs.name }}
@@ -2,6 +2,11 @@ name: UI Build

 on:
  workflow_call:
+    inputs:
+      hf_ui_version:
+        description: 'Version string for version.json (e.g. b12345, or <short-sha>-<epoch> as fallback)'
+        required: false  # when omitted, the build step falls back to '' (HF_UI_VERSION) and 'b0000' (LLAMA_UI_VERSION)
+        type: string

 jobs:
  build:
@@ -25,9 +30,16 @@ jobs:
        working-directory: tools/ui

      - name: Build application
+        env:
+          HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
+          LLAMA_UI_VERSION: ${{ inputs.hf_ui_version || 'b0000' }}
        run: npm run build
        working-directory: tools/ui

+      - name: Run PWA unit tests (versioned build output)
+        run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
+        working-directory: tools/ui
+
      - name: Generate checksums
        run: |
          cd tools/ui/dist
@@ -1,8 +1,8 @@
 name: UI (self-hosted)

 # these are the same as ui.yml, but with self-hosted runners
-# the runners come with pre-installed Playwright browsers version: 1.56.1
-# the jobs are much lighter because they don't need to install node and playwright browsers
+# the jobs are lighter because they don't need to install Node.js or Playwright browsers
+# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/

 on:
  workflow_dispatch:
@@ -61,6 +61,12 @@ jobs:
        run: npm ci
        working-directory: tools/ui

+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
@@ -72,12 +78,12 @@ jobs:
        working-directory: tools/ui

      - name: Run Client tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:client
        working-directory: tools/ui

      - name: Run Unit tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui

@@ -97,22 +103,23 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/

      - name: Build Storybook
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build-storybook
        working-directory: tools/ui

      - name: Run UI tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

      - name: Run E2E tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -43,7 +43,7 @@ jobs:
  ui-checks:
    name: Checks
    needs: ui-build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -60,6 +60,12 @@ jobs:
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
@@ -87,7 +93,7 @@ jobs:
        run: npm run test:client
        working-directory: tools/ui

-      - name: Run Unit tests
+      - name: Run Unit tests (uses pre-built dist/ from ui-build)
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui
@@ -95,7 +101,7 @@ jobs:
  e2e-tests:
    name: E2E Tests
    needs: ui-build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
@@ -117,10 +123,11 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
+      - name: Download built UI artifacts (reuses ui-build)
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/

      - name: Install Playwright browsers
        id: playwright
@@ -138,7 +145,7 @@ jobs:
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

-      - name: Run E2E tests
+      - name: Run E2E tests (uses pre-built dist/ from ui-build)
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -92,13 +92,6 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

-# Server Web UI temporary files (+ legacy directory)
-
-/tools/server/webui/node_modules
-/tools/server/webui/dist
-/tools/ui/node_modules
-/tools/ui/dist
-
 # Python

 /.venv
@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
+![llama](https://raw.githubusercontent.com/ggml-org/llama.brand/refs/heads/master/cover/llama-cpp/cover-llama-cpp-dark.svg)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
@@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.image_max_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--mtmd-batch-max-tokens"}, "N",
+        string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
+        [](common_params & params, int value) {
+            params.mtmd_batch_max_tokens = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
@@ -575,6 +575,7 @@ struct common_params {
    std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
    int image_min_tokens = -1;
    int image_max_tokens = -1;
+    int mtmd_batch_max_tokens = 1024;

    // finetune
    struct lr_opt lr;
@@ -1,9 +1,7 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-backend.h"
 #include "llama.h"
-#include "../src/llama-ext.h"

 #include <vector>

@@ -18,31 +16,35 @@ enum common_params_fit_status {
 //   - this function is NOT thread safe because it modifies the global llama logger state
 //   - only parameters that have the same value as in llama_default_model_params are modified
 //     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
+common_params_fit_status common_fit_params(
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams,
+                              float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+   llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+                             size_t * margins,               // margins of memory to leave per device in bytes
+                           uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
+                     ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

 // print estimated memory to stdout
 void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams);

-void common_memory_breakdown_print(const struct llama_context * ctx);
+void common_memory_breakdown_print(const llama_context * ctx);
+
+// TODO: convert this to common_device_memory_data that wraps llama_device_memory_data
+//       add API for accessing the internal `llama-ext.h` information
+struct llama_device_memory_data;

 // Load a model + context with no_alloc and return the per-device memory breakdown.
 std::vector<llama_device_memory_data> common_get_device_memory_data(
-                                  const char   * path_model,
-        const struct llama_model_params         * mparams,
-        const struct llama_context_params       * cparams,
-        std::vector<ggml_backend_dev_t>         & devs,
-                                      uint32_t  & hp_ngl,
-                                      uint32_t  & hp_n_ctx_train,
-                                      uint32_t  & hp_n_expert,
-                           enum ggml_log_level    log_level);
+                         const char * path_model,
+           const llama_model_params * mparams,
+         const llama_context_params * cparams,
+    std::vector<ggml_backend_dev_t> & devs,
+                           uint32_t & hp_ngl,
+                           uint32_t & hp_n_ctx_train,
+                           uint32_t & hp_n_expert,
+                     ggml_log_level   log_level);
@@ -7741,6 +7741,23 @@ static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, si
    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);

+        std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool);
+        ggml_vk_ctx_begin(src->device, subctx);
+        subctx->s->buffer->buf.pipelineBarrier(
+            vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer,
+            vk::PipelineStageFlagBits::eHost,
+            {},
+            { { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite,
+                vk::AccessFlagBits::eHostRead } },
+            {}, {});
+        ggml_vk_ctx_end(subctx);
+        ggml_vk_submit(subctx, src->device->fence);
+        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX),
+                 "vk_buffer_read_2d uma waitForFences");
+        src->device->device.resetFences({ src->device->fence });
+        ggml_vk_queue_command_pools_cleanup(src->device);
+
        if (width == spitch && width == dpitch) {
            memcpy(dst, (const uint8_t *) src->ptr + offset, width * height);
        } else {
@@ -16,11 +16,80 @@ set(HF_ENABLED        "" CACHE STRING "Whether to allow HF Bucket download (ON/O
 set(BUILD_UI          "" CACHE STRING "Build UI via npm (ON/OFF)")
 set(LLAMA_UI_EMBED    "" CACHE STRING "Path to llama-ui-embed helper")

+# IMPORTANT: PWA asset names are duplicated in 3 places that must be kept in sync:
+#   1. tools/ui/src/lib/constants/pwa.ts   (APPLE_DEVICES, PWA_MANIFEST, PUBLIC_ENDPOINTS)
+#   2. tools/server/server-http.cpp        (public_endpoints)
+#   3. scripts/ui-assets.cmake             (ASSETS list)
+# Note: in server-http.cpp the public endpoints for the splash screens are
+# generated via a helper.
+#
+# When adding/changing PWA assets, update tools/ui/src/lib/constants/pwa.ts first,
+# then sync any new file names here and in server-http.cpp.
 set(ASSETS
-    bundle.css
-    bundle.js
    index.html
    loading.html
+    # PWA assets
+    favicon.ico
+    favicon-dark.ico
+    favicon.svg
+    favicon-dark.svg
+    pwa-64x64.png
+    pwa-192x192.png
+    pwa-512x512.png
+    maskable-icon-512x512.png
+    apple-touch-icon-180x180.png
+    # iOS splash screens
+    apple-splash-portrait-640x1136.png
+    apple-splash-landscape-1136x640.png
+    apple-splash-portrait-750x1334.png
+    apple-splash-landscape-1334x750.png
+    apple-splash-portrait-1170x2532.png
+    apple-splash-landscape-2532x1170.png
+    apple-splash-portrait-1179x2556.png
+    apple-splash-landscape-2556x1179.png
+    apple-splash-portrait-1206x2622.png
+    apple-splash-landscape-2622x1206.png
+    apple-splash-portrait-1284x2778.png
+    apple-splash-landscape-2778x1284.png
+    apple-splash-portrait-1290x2796.png
+    apple-splash-landscape-2796x1290.png
+    apple-splash-portrait-1320x2868.png
+    apple-splash-landscape-2868x1320.png
+    apple-splash-portrait-1488x2266.png
+    apple-splash-landscape-2266x1488.png
+    apple-splash-portrait-1640x2360.png
+    apple-splash-landscape-2360x1640.png
+    apple-splash-portrait-1668x2388.png
+    apple-splash-landscape-2388x1668.png
+    apple-splash-portrait-2048x2732.png
+    apple-splash-landscape-2732x2048.png
+    # iOS dark splash screens
+    apple-splash-portrait-dark-640x1136.png
+    apple-splash-landscape-dark-1136x640.png
+    apple-splash-portrait-dark-750x1334.png
+    apple-splash-landscape-dark-1334x750.png
+    apple-splash-portrait-dark-1170x2532.png
+    apple-splash-landscape-dark-2532x1170.png
+    apple-splash-portrait-dark-1179x2556.png
+    apple-splash-landscape-dark-2556x1179.png
+    apple-splash-portrait-dark-1206x2622.png
+    apple-splash-landscape-dark-2622x1206.png
+    apple-splash-portrait-dark-1284x2778.png
+    apple-splash-landscape-dark-2778x1284.png
+    apple-splash-portrait-dark-1290x2796.png
+    apple-splash-landscape-dark-2796x1290.png
+    apple-splash-portrait-dark-1320x2868.png
+    apple-splash-landscape-dark-2868x1320.png
+    apple-splash-portrait-dark-1640x2360.png
+    apple-splash-landscape-dark-2360x1640.png
+    apple-splash-portrait-dark-1668x2388.png
+    apple-splash-landscape-dark-2388x1668.png
+    apple-splash-portrait-dark-2048x2732.png
+    apple-splash-landscape-dark-2732x2048.png
+    manifest.webmanifest
+    sw.js
+    _app/version.json
+    build.json
 )

 set(DIST_DIR     "${UI_BINARY_DIR}/dist")
@@ -159,7 +228,7 @@ function(npm_build out_var)

    message(STATUS "UI: running npm run build, output -> ${DIST_DIR}")
    execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}"
+        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" "LLAMA_UI_VERSION=${HF_VERSION}" "LLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
                ${NPM_EXECUTABLE} run build
        WORKING_DIRECTORY "${UI_SOURCE_DIR}"
        RESULT_VARIABLE rc
@@ -274,8 +343,35 @@ function(emit_files)
        foreach(asset ${ASSETS})
            list(APPEND args "${asset}" "${DIST_DIR}/${asset}")
        endforeach()
+
+        # Bundle files live in _app/immutable/ — vanilla SvelteKit output, no plugin
+        # rewriting. Embedded names must match the exact _app/ paths that index.html
+        # and sw.js reference.
+        file(GLOB_RECURSE detected_bundle_js "${DIST_DIR}/_app/immutable/bundle.*.js")
+        file(GLOB_RECURSE detected_bundle_css "${DIST_DIR}/_app/immutable/assets/bundle.*.css")
+        file(GLOB_RECURSE detected_workbox "${DIST_DIR}/workbox-*.js")
+        # Compute relative path from DIST_DIR to each found file.
+        # e.g. /path/to/build/tools/ui/dist/_app/immutable/bundle.XXX.js
+        #      -> _app/immutable/bundle.XXX.js
+        foreach(f ${detected_bundle_js})
+            string(REPLACE "${DIST_DIR}/" "" rel "${f}")
+            list(APPEND args "${rel}" "${f}")
+        endforeach()
+        foreach(f ${detected_bundle_css})
+            string(REPLACE "${DIST_DIR}/" "" rel "${f}")
+            list(APPEND args "${rel}" "${f}")
+        endforeach()
+        foreach(f ${detected_workbox})
+            string(REPLACE "${DIST_DIR}/" "" rel "${f}")
+            list(APPEND args "${rel}" "${f}")
+        endforeach()
    endif()

+    # build.json carries the llama.cpp build number for UI version display.
+    # It is separate from SvelteKit's _app/version.json (used for SW cache invalidation).
+    # build.json is generated by the vite plugin (buildInfoPlugin) during npm build;
+    # CMake does not create it, it only embeds the copy from the dist that npm produced.
+
    execute_process(
        COMMAND "${LLAMA_UI_EMBED}" ${args}
        RESULT_VARIABLE rc
@@ -300,6 +396,8 @@ endif()
 set(provisioned FALSE)

 if(BUILD_UI)
+    # Resolve version from git build-info if not explicitly set
+    resolve_version(HF_VERSION)
    npm_build(NPM_OK)
    if(NPM_OK)
        set(provisioned TRUE)
@@ -2,6 +2,7 @@

 // this is a staging header for new llama.cpp API
 // breaking changes and C++ are allowed. everything here should be considered WIP
+// try as much as possible to not include this header in the rest of the codebase

 #include "llama.h"

@@ -54,6 +54,10 @@ struct clip_graph {
    virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
    // TODO: build_mm(w, b, x) to support bias

+    virtual bool support_batch() const {
+        return false;
+    }
+
    //
    // utility functions
    //
@@ -171,6 +171,8 @@ struct clip_ctx {
    std::map<ggml_backend_dev_t, size_t> mem_usage;
    std::map<ggml_backend_dev_t, size_t> mem_compute;

+    bool support_batch = false;
+
    clip_ctx(clip_context_params & ctx_params) {
        flash_attn_type = ctx_params.flash_attn_type;
        no_alloc = ctx_params.no_alloc;
@@ -314,7 +316,7 @@ ggml_tensor * clip_graph::build_vit(
            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
            const build_vit_opts & opts
        ) {
-    // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode)
+    // batch dim: inp is [n_embd, n_pos, B]
    const int64_t B = inp->ne[2];

    if (learned_pos_embd) {
@@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
    return cur;
 }

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
+static std::unique_ptr<clip_graph> clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
    const clip_image_f32 & img = *imgs.entries[0];
    std::unique_ptr<clip_graph> builder;

@@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    // TODO [QWEN_VIDEO]: improve this in the future
    builder->n_batch = imgs.entries.size();

-    return builder->build();
+    return builder;
 }

 //
@@ -2819,7 +2821,7 @@ struct clip_model_loader {
        std::vector<support_info_op> ops;
    };

-    static void warmup(clip_ctx & ctx_clip) {
+    static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) {
        // create a fake batch
        const auto & hparams = ctx_clip.model.hparams;
        clip_image_f32_batch batch;
@@ -2833,6 +2835,20 @@ struct clip_model_loader {
            LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
        }
        batch.entries.push_back(std::move(img));
+        return batch;
+    }
+
+    static void init_ctx(clip_ctx & ctx_clip) {
+        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); // space for max_nodes tensor overheads plus one graph overhead
+
+        // check batching support: create the graph builder for a dummy batch and cache what it reports in ctx_clip.support_batch
+        auto batch = get_dummy_batch(ctx_clip);
+        auto builder = clip_get_graph_builder(&ctx_clip, batch);
+        ctx_clip.support_batch = builder->support_batch();
+    }
+
+    static void warmup(clip_ctx & ctx_clip) {
+        auto batch = get_dummy_batch(ctx_clip);
        warmup(ctx_clip, batch);
    }

@@ -2905,9 +2921,7 @@ struct clip_model_loader {

    // only initialize backend buffers, but do not allocate them yet
    static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
-        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
-
-        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
+        ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build();
        ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);

        ctx_clip.mem_compute.clear();
@@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
            ctx_vision = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
            loader.load_tensors(*ctx_vision);
+            loader.init_ctx(*ctx_vision);
            if (ctx_params.warmup) {
                loader.warmup(*ctx_vision);
            }
@@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
+            loader.init_ctx(*ctx_audio);
            if (ctx_params.warmup) {
                loader.warmup(*ctx_audio);
            }
@@ -3484,25 +3500,22 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    return n_patches;
 }

-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector<float> & out_vec) {
    clip_image_f32_batch imgs;
    clip_image_f32_ptr img_copy(clip_image_f32_init());
    *img_copy = *img;
    imgs.entries.push_back(std::move(img_copy));

-    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
+    return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec);
 }

-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
    const clip_image_f32_batch & imgs = *imgs_c_ptr;
    int n_batch_cur = imgs.entries.size();

-    // maximum supported batch size, usually == 2 for qwen-vl-based models
-    int n_batch_max = clip_model_n_batch_max(ctx);
-
-    // TODO @ngxson : implement batch size > 1 as a loop
-    //                we don't need true batching support because the cgraph will gonna be big anyway
-    if (n_batch_cur > n_batch_max) {
+    // [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames
+    if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) {
+        LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx));
        return false;
    }

@@ -3513,7 +3526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

    // build the inference graph
    ggml_backend_sched_reset(ctx->sched.get());
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build();
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

    // set inputs
@@ -3582,6 +3595,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            const int n  = nx * ny;

            for (int b = 0; b < n_batch_cur; b++) {
+                LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny);
                const auto & buf = imgs.entries[b]->get_ro_buf();
                float * batch_entry = inp_raw.data() + b * (3*n);
                for (int y = 0; y < ny; y++) {
@@ -4416,7 +4430,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

-    // sanity check (only support batch size of 1 for now)
+    // sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one)
    const int n_tokens_out = embeddings->ne[1];
    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
    if (n_tokens_out != expected_n_tokens_out) {
@@ -4424,16 +4438,26 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        GGML_ABORT("Invalid number of output tokens");
    }

-    // copy the embeddings to the location passed by the user
-    if (vec != nullptr) {
-        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__,
+        (int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]);
+
+    // copy output to user buffer if provided
+    // if output is empty, skip the copy
+    if (!out_batch_embd.empty()) {
+        if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) {
+            LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings));
+            GGML_ABORT("Output buffer size mismatch");
+        }
+        ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings));
+    } else {
+        LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__);
    }

    // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
    if (ctx->debug_output_embeddings) {
        const int64_t n_embd = embeddings->ne[0];
        const int64_t n_tokens = embeddings->ne[1];
-        std::vector<float> emb_data(n_embd * n_tokens);
+        std::vector<float> emb_data(ggml_nelements(embeddings));
        ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));

        LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
@@ -4570,7 +4594,14 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }

-int clip_model_n_batch_max(const struct clip_ctx * ctx) {
+// Returns the support_batch flag stored on the given clip context.
+bool clip_support_batch(const struct clip_ctx * ctx) {
+    return ctx->support_batch;
+}
+
+// TODO @ngxson : this is no longer correct with mtmd_batch API
+// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
+// this logic should be refactored in near future to distinctly handle "merge frames" and "batching"
+int clip_model_n_temporal_merge(const struct clip_ctx * ctx) {
    switch (ctx->proj_type()) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
@@ -97,8 +97,8 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
 struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

-bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
-bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);

 bool clip_is_llava(const struct clip_ctx * ctx);
 // note for contributor: this clip_is_(model) pattern is deprecated
@@ -107,7 +107,9 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);

-int clip_model_n_batch_max(const struct clip_ctx * ctx);
+bool clip_support_batch(const struct clip_ctx * ctx);
+
+int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this

 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);

@@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() {
    ggml_set_name(inp_raw, "inp_raw_scaled");

    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+    inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    ggml_set_name(inp, "inp");
    // note: no patch bias
@@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
        // first half
        ggml_tensor * first;
        {
-            first = ggml_view_3d(ctx0, cur,
-                n_dim/2, n_head, n_pos,
+            first = ggml_view_4d(ctx0, cur,
+                n_dim/2, n_head, n_pos, n_batch,
                cur->nb[1],
                cur->nb[2],
+                cur->nb[3],
                0);
            first = ggml_rope_ext(
                ctx0,
@@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
        // second half
        ggml_tensor * second;
        {
-            second = ggml_view_3d(ctx0, cur,
-                n_dim/2, n_head, n_pos,
+            second = ggml_view_4d(ctx0, cur,
+                n_dim/2, n_head, n_pos, n_batch,
                cur->nb[1],
                cur->nb[2],
+                cur->nb[3],
                n_dim/2 * ggml_element_size(cur));
            second = ggml_rope_ext(
                ctx0,
@@ -103,14 +105,14 @@ ggml_cgraph * clip_graph_gemma4v::build() {
        const int kernel_size = hparams.n_merge;
        GGML_ASSERT(kernel_size > 0);

-        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
-        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
+        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch]
+        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch);
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
                           kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
        const int out_x = n_patches_x / kernel_size;
        const int out_y = n_patches_y / kernel_size;
-        // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
-        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
+        // [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch]
+        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
        cb(cur, "pooled", -1);
@@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph {
    clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
    ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
+    bool support_batch() const override { return true; }
 };

 struct clip_graph_gemma4uv : clip_graph {
@@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image,

 // helper function that automatically:
 // 1. run llama_decode() on text chunks
-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error
 // otherwise, returns 0 on success
 // this function is NOT thread-safe
 MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
@@ -157,13 +157,16 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
 } // extern "C"
 #endif

+#ifdef __cplusplus
+#include <set>
+#include <memory>
+
+namespace mtmd_helper {
+
 //
 // C++ wrappers
 //

-#ifdef __cplusplus
-namespace mtmd_helper {
-
 // video-related C++ wrappers
 struct mtmd_helper_video_deleter {
    void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); }
@@ -69,8 +69,8 @@ struct mtmd_bitmap {
        return data.size();
    }

-    bool can_batch_with(const mtmd_bitmap & other) const {
-        // [QWEN_VIDEO] can batch if both are images with same size
+    bool can_merge_with(const mtmd_bitmap & other) const {
+        // [QWEN_VIDEO] can (temporal) merge if both are images with same size
        return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny;
    }

@@ -90,12 +90,24 @@ struct mtmd_image_tokens {
    uint32_t ny = 0; // number of tokens in y direction
    mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
    uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
+    uint32_t n_temporal_merge = 1; // for qwen-vl style temporal merge
    uint32_t n_tokens() const {
        if (pos == MTMD_POS_TYPE_HUNYUANVL) {
            // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
            return (nx + 1) * ny + 2;
        }
-        return nx * ny;
+        // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
+        if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
+            return nx * ny;
+        }
+        uint32_t nz = batch_f32.entries.size();
+        // TODO: simplify this by repeating the last frame until it fits the temporal merge
+        if (nz % n_temporal_merge != 0) {
+            nz = nz / n_temporal_merge + 1;
+        } else {
+            nz = nz / n_temporal_merge;
+        }
+        return nx * ny * nz;
    }
    clip_image_f32_batch batch_f32; // preprocessed image patches
    std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -110,12 +122,17 @@ struct mtmd_image_tokens {
        return false;
    }

+    // Returns true when `other` has the same token grid (nx, ny) and the same
+    // position type as this object.
+    // Declared const because it only reads fields of both operands, so it can
+    // also be called on a const mtmd_image_tokens.
+    bool can_batch_with(const mtmd_image_tokens & other) const {
+        return nx == other.nx && ny == other.ny && pos == other.pos;
+    }
+
    mtmd_image_tokens clone() {
        return mtmd_image_tokens{
            nx,
            ny,
            pos,
            image_idx,
+            n_temporal_merge,
            batch_f32.clone(),
            id
        };
@@ -153,12 +170,49 @@ struct mtmd_input_chunk {
    std::vector<llama_token> tokens_text;
    mtmd_image_tokens_ptr tokens_image;
    mtmd_audio_tokens_ptr tokens_audio;
+
+    // Decide whether `other` may be placed in the same batch as this chunk.
+    // Only image chunks of the same type are considered; the final decision is
+    // delegated to mtmd_image_tokens::can_batch_with().
+    bool can_batch_with(const mtmd_input_chunk & other) const {
+        const bool same_type   = type == other.type;
+        const bool both_images = tokens_image && other.tokens_image;
+
+        // TODO: allow batching audio chunks of the same size
+        if (!same_type || !both_images) {
+            return false;
+        }
+
+        return tokens_image->can_batch_with(*other.tokens_image);
+    }
+
+    // Reports whether the media payload of this chunk is a placeholder.
+    // Chunk types other than image and audio, and chunks whose token pointer
+    // is null, are never placeholders.
+    bool is_placeholder() const {
+        switch (type) {
+            case MTMD_INPUT_CHUNK_TYPE_IMAGE:
+                return tokens_image && tokens_image->is_placeholder();
+            case MTMD_INPUT_CHUNK_TYPE_AUDIO:
+                return tokens_audio && tokens_audio->is_placeholder();
+            default:
+                return false;
+        }
+    }
 };

 struct mtmd_input_chunks {
    std::vector<mtmd_input_chunk> entries;
 };

+// A group of media chunks that are encoded together.
+// The chunk pointers in `entries` are stored as-is; `output_embd` holds the
+// aggregated embeddings for the whole batch.
+struct mtmd_batch {
+    mtmd_context * ctx;
+    std::vector<const mtmd_input_chunk *> entries;
+    std::vector<float> output_embd; // aggregated output embedding for the whole batch
+
+    mtmd_batch(mtmd_context * ctx): ctx(ctx) {}
+
+    // sum of the token counts of every chunk currently in the batch
+    int32_t n_tokens() const {
+        int32_t total = 0;
+        for (size_t i = 0; i < entries.size(); i++) {
+            total += mtmd_input_chunk_get_n_tokens(entries[i]);
+        }
+        return total;
+    }
+};
+
 // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
 // models not having it (llava-1.6) will process embeddings without any special tokens in-between
 enum mtmd_slice_tmpl {
@@ -197,6 +251,7 @@ mtmd_context_params mtmd_context_params_default() {
        /* image_max_tokens  */ -1,
        /* cb_eval           */ nullptr,
        /* cb_eval_user_data */ nullptr,
+        /* batch_max_tokens  */ 1024,
    };
    return params;
 }
@@ -204,7 +259,7 @@ mtmd_context_params mtmd_context_params_default() {
 struct mtmd_context {
    struct clip_ctx * ctx_v; // vision
    struct clip_ctx * ctx_a; // audio
-    std::vector<float> image_embd_v; // image embedding vector
+    std::vector<float> out_embd; // image embedding vector

    bool print_timings;
    int n_threads;
@@ -239,17 +294,21 @@ struct mtmd_context {
    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
    std::unique_ptr<mtmd_image_preprocessor> image_preproc;

+    // batching
+    int32_t batch_max_tokens;
+
    // TODO @ngxson : add timings

    mtmd_context(const char * mmproj_fname,
                   const llama_model * text_model,
                   const mtmd_context_params & ctx_params,
                   bool no_alloc = false) :
-        print_timings(ctx_params.print_timings),
-        n_threads    (ctx_params.n_threads),
-        media_marker (ctx_params.media_marker),
-        n_embd_text  (text_model ? llama_model_n_embd_inp(text_model) : -1),
-        vocab        (text_model ? llama_model_get_vocab(text_model) : nullptr)
+        print_timings   (ctx_params.print_timings),
+        n_threads       (ctx_params.n_threads),
+        media_marker    (ctx_params.media_marker),
+        n_embd_text     (text_model ? llama_model_n_embd_inp(text_model) : -1),
+        vocab           (text_model ? llama_model_get_vocab(text_model) : nullptr),
+        batch_max_tokens(ctx_params.batch_max_tokens)
    {
        if (ctx_params.image_marker != nullptr) {
            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -680,6 +739,16 @@ struct mtmd_context {
        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
    }

+    // Embedding size reported by clip_n_mmproj_embd() for the loaded encoder.
+    // The vision context takes precedence over the audio context.
+    // Throws std::runtime_error when neither context is loaded.
+    int64_t n_embd_out() const {
+        if (ctx_v) {
+            return clip_n_mmproj_embd(ctx_v);
+        }
+        if (ctx_a) {
+            return clip_n_mmproj_embd(ctx_a);
+        }
+        throw std::runtime_error("no CLIP model loaded");
+    }
+
    ~mtmd_context() {
        clip_free(ctx_a);
        clip_free(ctx_v);
@@ -845,7 +914,7 @@ struct mtmd_tokenizer {
        // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
        int n_merge_frames = 1;
        if (ctx->ctx_v) {
-            n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
+            n_merge_frames = clip_model_n_temporal_merge(ctx->ctx_v);
            GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
        }

@@ -860,7 +929,7 @@ struct mtmd_tokenizer {
                if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
                    const mtmd_bitmap * bm_a = parts[i].bitmap;
                    const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
-                    if (bm_a->can_batch_with(*bm_b)) {
+                    if (bm_a->can_merge_with(*bm_b)) {
                        LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
                        merged_bitmaps.push_back({bm_a, bm_b});
                        parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
@@ -1103,13 +1172,17 @@ struct mtmd_tokenizer {
                size_t n_tokens = 0;
                for (const auto & e : batch_f32.entries) {
                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
-                    if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
+                    if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
                        // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
                        break;
                    }
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+
+                // [QWEN_VIDEO] improve this in the future
+                image_tokens->n_temporal_merge = clip_model_n_temporal_merge(ctx->ctx_v);
+
                if (mtmd_decode_use_mrope(ctx)) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
@@ -1327,60 +1400,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
    }
 }

-int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
-    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
-        return 0;
-    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-        if (!ctx->ctx_v) {
-            LOG_ERR("%s: model does not support vision input\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_image == nullptr) {
-            LOG_ERR("%s: image tokens are null\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_image->is_placeholder()) {
-            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
-            return 1;
-        }
-        return mtmd_encode(ctx, chunk->tokens_image.get());
-    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-        if (!ctx->ctx_a) {
-            LOG_ERR("%s: model does not support audio input\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_audio == nullptr) {
-            LOG_ERR("%s: audio tokens are null\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_audio->is_placeholder()) {
-            LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
-            return 1;
-        }
-        int n_mmproj_embd = ctx->n_embd_text;
-        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
-        bool ok = clip_image_batch_encode(
-            ctx->ctx_a,
-            ctx->n_threads,
-            &chunk->tokens_audio->batch_f32,
-            ctx->image_embd_v.data());
-        return ok ? 0 : 1;
-    }
-
-    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
-    return 1;
-}
-
-int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
    clip_ctx * ctx_clip = ctx->ctx_v;
    if (!ctx_clip) {
        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
        return 1;
    }
    auto proj_type = clip_get_projector_type(ctx_clip);
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
+
+    int n_embd_out = ctx->n_embd_out();
+    auto n_tokens_out = image_tokens->n_tokens();
+    out_embd.resize((size_t)n_embd_out * n_tokens_out);
+
    bool ok = false;

    if (clip_is_llava(ctx_clip)
@@ -1400,12 +1431,19 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
                return 1;
            }
            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
-            ok = clip_image_encode(
+            std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
+            bool ok_i = clip_image_encode(
                ctx_clip,
                ctx->n_threads,
                entries[i].get(),
-                ctx->image_embd_v.data() + offset);
-            offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
+                tmp_embd);
+            if (!ok_i) {
+                LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
+                return 1;
+            }
+            ok = true;
+            std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
+            offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
        }
    } else {
        if (image_tokens->is_placeholder()) {
@@ -1416,14 +1454,206 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
            ctx_clip,
            ctx->n_threads,
            &image_tokens->batch_f32,
-            ctx->image_embd_v.data());
+            out_embd);
    }

    return ok ? 0 : 1;
 }

+// Encode a single chunk and write the resulting embeddings into out_embd.
+//   - text chunks: nothing is encoded, a warning is logged and 0 is returned
+//   - image chunks: validated, then forwarded to mtmd_encode_impl()
+//   - audio chunks: validated, out_embd is resized to n_tokens * n_embd_out()
+//     and the audio context is run through clip_image_batch_encode()
+// Returns 0 on success and 1 on any error (including an unknown chunk type).
+static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector<float> & out_embd) {
+    switch (chunk->type) {
+        case MTMD_INPUT_CHUNK_TYPE_TEXT:
+            {
+                LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
+                return 0;
+            }
+        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
+            {
+                if (!ctx->ctx_v) {
+                    LOG_ERR("%s: model does not support vision input\n", __func__);
+                    return 1;
+                }
+                auto * image = chunk->tokens_image.get();
+                if (image == nullptr) {
+                    LOG_ERR("%s: image tokens are null\n", __func__);
+                    return 1;
+                }
+                if (image->is_placeholder()) {
+                    LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+                    return 1;
+                }
+                return mtmd_encode_impl(ctx, image, out_embd);
+            }
+        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
+            {
+                if (!ctx->ctx_a) {
+                    LOG_ERR("%s: model does not support audio input\n", __func__);
+                    return 1;
+                }
+                auto * audio = chunk->tokens_audio.get();
+                if (audio == nullptr) {
+                    LOG_ERR("%s: audio tokens are null\n", __func__);
+                    return 1;
+                }
+                if (audio->is_placeholder()) {
+                    LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
+                    return 1;
+                }
+                // size the output buffer up front; the encoder copies into it
+                int n_mmproj_embd = ctx->n_embd_out();
+                out_embd.resize((size_t)audio->n_tokens * n_mmproj_embd);
+                const bool encoded = clip_image_batch_encode(
+                    ctx->ctx_a,
+                    ctx->n_threads,
+                    &audio->batch_f32,
+                    out_embd);
+                return encoded ? 0 : 1;
+            }
+        default:
+            break;
+    }
+
+    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
+    return 1;
+}
+
+// Non-batching entry point: encodes one chunk into the context-owned buffer
+// ctx->out_embd. Any std::exception is logged and reported as error code 1.
+int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
+    int32_t status = 1; // stays 1 if the implementation throws
+    try {
+        status = mtmd_encode_chunk_impl(ctx, chunk, ctx->out_embd);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+    }
+    return status;
+}
+
+// Encodes image tokens into the context-owned buffer ctx->out_embd.
+// Any std::exception is logged and reported as error code 1.
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+    int32_t status = 1; // stays 1 if the implementation throws
+    try {
+        status = mtmd_encode_impl(ctx, image_tokens, ctx->out_embd);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+    }
+    return status;
+}
+
 float * mtmd_get_output_embd(mtmd_context * ctx) {
-    return ctx->image_embd_v.data();
+    return ctx->out_embd.data();
+}
+
+// Allocates a new, empty mtmd_batch bound to ctx using `new`.
+mtmd_batch * mtmd_batch_init(mtmd_context * ctx) {
+    return new mtmd_batch(ctx);
+}
+
+// Destroys a batch. Passing nullptr is allowed and does nothing.
+void mtmd_batch_free(mtmd_batch * batch) {
+    // deleting a null pointer is a no-op, so no explicit check is required
+    delete batch;
+}
+
+// Try to append a media chunk to the batch. The chunk pointer is stored as-is.
+// Returns:
+//   0 - chunk was added
+//   1 - generic error (null argument, text chunk, or no clip context for this chunk type)
+//   2 - batch too large: the context does not support batching, or the token
+//       total would exceed batch_max_tokens (chunk is not added)
+//   3 - chunk cannot be batched with the first chunk already in the batch
+// The first chunk is always accepted, regardless of batch_max_tokens.
+int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
+    // public C entry point: reject null arguments instead of dereferencing them
+    if (batch == nullptr || chunk == nullptr) {
+        LOG_ERR("%s: batch and chunk must not be null\n", __func__);
+        return 1;
+    }
+
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        LOG_ERR("%s: text chunk is not supported in batch\n", __func__);
+        return 1;
+    }
+
+    auto * ctx = batch->ctx->get_clip_ctx(chunk);
+    if (!ctx) {
+        LOG_ERR("%s: model does not support input chunk type %d\n", __func__, (int)chunk->type);
+        return 1;
+    }
+
+    if (batch->entries.empty()) {
+        // batch must have at least one chunk
+        batch->entries.push_back(chunk);
+        return 0;
+    }
+
+    if (!clip_support_batch(ctx)) {
+        // if no batching support, batch can only have one single chunk
+        return 2; // "batch too large" error code
+    }
+
+    int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
+    if (new_n_tokens > batch->ctx->batch_max_tokens) {
+        return 2; // "batch too large" error code
+    }
+
+    // compatibility is checked against the first chunk only
+    auto & first_chunk = batch->entries[0];
+    if (first_chunk->can_batch_with(*chunk)) {
+        batch->entries.push_back(chunk);
+        return 0;
+    }
+
+    return 3; // "cannot batch" error code
+}
+
+// Encode every chunk of the batch in a single pass.
+// The first chunk is copied and the preprocessed entries of all remaining
+// chunks are cloned into that copy, so the whole batch is presented to
+// mtmd_encode_chunk_impl() as one chunk. The result goes to batch->output_embd.
+// Returns 0 on success, 1 on error (empty batch, placeholder chunk,
+// unsupported chunk type, or encoder failure).
+static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) {
+    const auto & chunks = batch->entries;
+
+    if (chunks.empty()) {
+        LOG_ERR("%s: batch is empty\n", __func__);
+        return 1;
+    }
+    for (size_t ic = 0; ic < chunks.size(); ic++) {
+        if (chunks[ic]->is_placeholder()) {
+            LOG_ERR("%s: chunk is placeholder\n", __func__);
+            return 1;
+        }
+    }
+
+    // appends a clone of every entry in src to dst; src is left untouched
+    auto append_cloned = [](auto & dst, auto & src) {
+        auto cloned = src.clone();
+        for (auto & entry : cloned.entries) {
+            dst.entries.push_back(std::move(entry));
+        }
+    };
+
+    // represent the whole batch as one single chunk
+    // note: the first chunk's entries are already part of the copy, so the
+    //       loops below start at index 1
+    mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(chunks[0]));
+    if (batch_chunk->tokens_image) {
+        for (size_t ic = 1; ic < chunks.size(); ic++) {
+            auto & chunk = chunks[ic];
+            GGML_ASSERT(chunk->tokens_image);
+            append_cloned(batch_chunk->tokens_image->batch_f32, chunk->tokens_image->batch_f32);
+        }
+    } else if (batch_chunk->tokens_audio) {
+        for (size_t ic = 1; ic < chunks.size(); ic++) {
+            auto & chunk = chunks[ic];
+            GGML_ASSERT(chunk->tokens_audio);
+            append_cloned(batch_chunk->tokens_audio->batch_f32, chunk->tokens_audio->batch_f32);
+        }
+    } else {
+        LOG_ERR("%s: unsupported chunk type\n", __func__);
+        return 1;
+    }
+
+    LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n",
+            __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get()));
+    return mtmd_encode_chunk_impl(
+        batch->ctx,
+        batch_chunk.get(),
+        batch->output_embd);
+}
+
+// Public wrapper around mtmd_batch_encode_impl().
+// Any std::exception is logged and reported as error code 1.
+int32_t mtmd_batch_encode(mtmd_batch * batch) {
+    int32_t status = 1; // stays 1 if the implementation throws
+    try {
+        status = mtmd_batch_encode_impl(batch);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+    }
+    return status;
+}
+
+// Locate the embeddings of `chunk` inside the batch output buffer.
+// Chunks are laid out back to back in insertion order, each occupying
+// n_tokens * n_embd_out() floats. Matching is done by pointer identity.
+// Returns nullptr when the output buffer is empty or when the chunk is not
+// part of the batch.
+float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
+    if (batch->output_embd.empty()) {
+        LOG_ERR("%s: batch has not been encoded yet\n", __func__);
+        return nullptr;
+    }
+
+    const size_t n_embd = batch->ctx->n_embd_out();
+    size_t offset = 0; // running end offset (in floats) of the current entry
+    for (size_t i = 0; i < batch->entries.size(); i++) {
+        const mtmd_input_chunk * entry = batch->entries[i];
+        const size_t offset_prev = offset;
+        offset += mtmd_input_chunk_get_n_tokens(entry) * n_embd;
+        // every entry must lie inside the output buffer
+        GGML_ASSERT(offset_prev <  batch->output_embd.size());
+        GGML_ASSERT(offset      <= batch->output_embd.size());
+        if (entry == chunk) {
+            return batch->output_embd.data() + offset_prev;
+        }
+    }
+    return nullptr; // not found
+}

 bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -1801,7 +2031,7 @@ static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip
        ctx_clip,
        ctx->n_threads,
        &image,
-        embd_output.data());
+        embd_output);
    if (!ok) {
        LOG_ERR("%s: failed to encode image\n", __func__);
    }
@@ -63,6 +63,7 @@ struct mtmd_bitmap;
 struct mtmd_image_tokens;
 struct mtmd_input_chunk;
 struct mtmd_input_chunks;
+struct mtmd_batch;

 struct mtmd_input_text {
    const char * text;
@@ -80,6 +81,7 @@ typedef struct mtmd_image_tokens mtmd_image_tokens;
 typedef struct mtmd_input_chunk  mtmd_input_chunk;
 typedef struct mtmd_input_chunks mtmd_input_chunks;
 typedef struct mtmd_input_text   mtmd_input_text;
+typedef struct mtmd_batch        mtmd_batch;

 struct mtmd_context_params {
    bool use_gpu;
@@ -97,6 +99,11 @@ struct mtmd_context_params {
    // callback function passed over to mtmd proper
    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data;
+
+    // batching params
+    int32_t batch_max_tokens; // maximum number of output tokens in a batch
+                              // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
+                              // (default: 1024)
 };

 MTMD_API const char * mtmd_default_marker(void);
@@ -265,12 +272,12 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                               const mtmd_bitmap ** bitmaps,
                               size_t n_bitmaps);

-// returns 0 on success
-// TODO: deprecate
-MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
-                             const mtmd_image_tokens * image_tokens);
+DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens),
+           "use mtmd_encode_chunk() instead");

+// text chunk will be ignored silently, only media chunk will be encoded
 // returns 0 on success
+// returns 1 on generic error
 MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
                                   const mtmd_input_chunk * chunk);

@@ -279,6 +286,26 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
 // llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);

+
+// batch encoding API
+// chunks are not owned by the batch, they will not be freed by mtmd_batch_free()
+// batch is valid for a given context, cannot be shared across contexts
+MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
+MTMD_API void         mtmd_batch_free(mtmd_batch * batch);
+
+// only media chunks are allowed, text chunks will be rejected
+// returns 0 on success
+// returns 1 on generic error
+// returns 2 if the batch is too large (chunk won't be added)
+// returns 3 if it cannot be batched with the existing chunks in the batch
+MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+
+// returns 0 on success
+// returns 1 on generic error
+MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
+MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+
+
 // Set callback for all future logging events.
 // If this is not called, or NULL is supplied, everything is output on stderr.
 MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
@@ -336,6 +363,11 @@ struct mtmd_input_chunk_deleter {
 };
 using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;

+// deleter functor for batch_ptr: releases the batch through mtmd_batch_free()
+struct mtmd_batch_deleter {
+    void operator()(mtmd_batch * batch) {
+        mtmd_batch_free(batch);
+    }
+};
+using batch_ptr = std::unique_ptr<mtmd_batch, mtmd_batch_deleter>;
+
 struct bitmap {
    bitmap_ptr ptr;
    bitmap() : ptr(nullptr) {}
@@ -344,6 +344,14 @@ const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const {
    throw std::runtime_error("Chunk not found");
 }

+std::pair<const mtmd::input_chunk_ptr *, size_t> server_tokens::find_next_media_chunk(size_t idx) const {
+    // look up the first media chunk whose start index comes after idx
+    const auto next = map_idx_to_media.upper_bound(idx);
+    if (next == map_idx_to_media.end()) {
+        // no media chunk after idx
+        return { nullptr, 0 };
+    }
+    return { &next->second, next->first };
+}
+
 void server_tokens::push_back(llama_token tok) {
    if (tok == LLAMA_TOKEN_NULL) {
        throw std::runtime_error("Invalid token");
@@ -1126,9 +1134,9 @@ json oaicompat_chat_params_parse(

    // Reasoning budget: pass parameters through to sampling layer
    {
-        int reasoning_budget = opt.reasoning_budget;
-        if (reasoning_budget == -1 && body.contains("thinking_budget_tokens")) {
-            reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
+        int reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
+        if (reasoning_budget == -1) {
+            reasoning_budget = opt.reasoning_budget;
        }

        if (!chat_params.thinking_end_tag.empty()) {
@@ -180,6 +180,10 @@ public:

    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;

+    // find next media chunk after idx
+    // returns a pair of pointer to the chunk (nullptr if not found) and its start index in tokens
+    std::pair<const mtmd::input_chunk_ptr *, size_t> find_next_media_chunk(size_t idx) const;
+
    void push_back(llama_token tok);

    // will create a copy of the chunk if it contains non-text data
@@ -80,6 +80,8 @@ struct server_slot {

    // multimodal
    mtmd_context * mctx = nullptr;
+    mtmd::batch_ptr mbatch = nullptr;
+    std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context

    // speculative decoding
    common_speculative * spec;
@@ -239,6 +241,18 @@ struct server_slot {

        // clear alora start
        alora_invocation_start = -1;
+
+        // clear multimodal state
+        mbatch.reset();
+        mtgt[0] = ctx_tgt;
+        mtgt[1] = nullptr;
+        if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
+            // TODO: in the future, figure out how to infuse target embeddings to the images
+            //       for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
+            //       maybe we simply need to call `common_speculative_process()` ?
+            //       [TAG_MTMD_DRAFT_PROCESSING]
+            mtgt[1] = ctx_dft;
+        }
    }

    void init_sampler() const {
@@ -578,6 +592,87 @@ struct server_slot {
        other.prompt = prompt.clone();
        other.init_sampler();
    }
+
+    // process the media chunk that starts at token index `idx` of the task's input tokens:
+    // fetch its embeddings from the slot's mtmd batch (creating and encoding a new batch when
+    // needed) and decode them into every non-null context in `mtgt`
+    //
+    // on success, n_tokens_out is set to the number of tokens of the chunk
+    //
+    // returns 0 on success
+    // returns -1 if decoding the chunk or encoding the batch failed
+    // returns 1 if the embeddings are still not available after encoding a fresh batch
+    // caller needs to update prompt.tokens after a successful call to keep track of the processing progress
+    int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
+        GGML_ASSERT(mctx);
+        const auto & input_tokens = task->tokens;
+        auto & chunk = input_tokens.find_chunk(idx);
+        int32_t res = 0;
+
+        // decode the chunk from the current batch, if the batch holds its embeddings
+        // returns 0 on success, -1 on decode failure, 1 if the embeddings are not available
+        // note: `res` is captured by reference and overwritten by the decode call
+        auto try_decode = [&]() -> int32_t {
+            if (mbatch) {
+                float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
+                if (embd) {
+                    // the same embeddings are decoded into each context, at the same position
+                    for (auto * lctx : mtgt) {
+                        if (lctx == nullptr) {
+                            continue;
+                        }
+                        llama_pos new_n_past; // unused for now
+                        res = mtmd_helper_decode_image_chunk(
+                            mctx,
+                            lctx,
+                            chunk.get(),
+                            embd,
+                            prompt.tokens.pos_next(),
+                            id,
+                            llama_n_batch(lctx),
+                            &new_n_past
+                        );
+                        if (res != 0) {
+                            SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
+                            return -1;
+                        }
+                    }
+                    n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
+                    return 0; // success
+                }
+            }
+            return 1; // (non-error) need to create & encode batch
+        };
+
+        // if a batch already exists, try to find the chunk's embeddings in it and decode them
+        res = try_decode();
+        if (res == 0) {
+            return 0;
+        } else if (res < 0) {
+            // fatal error
+            return res;
+        }
+
+        // otherwise, the batch is either uninitialized or is used up
+        // we need to create & encode a new batch
+        mbatch.reset(mtmd_batch_init(mctx));
+        res = mtmd_batch_add_chunk(mbatch.get(), chunk.get());
+        GGML_ASSERT(res == 0); // we should never have an empty batch
+
+        // try batching as much as possible
+        // (n_added is only used for logging; idx_cur is the start index of the last chunk tried)
+        int n_added = 1;
+        size_t idx_cur = idx;
+        while (res == 0) {
+            auto [next_chunk, next_idx] = input_tokens.find_next_media_chunk(idx_cur);
+            if (next_chunk == nullptr) {
+                break;
+            }
+            res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get());
+            n_added += (res == 0 ? 1 : 0);
+            idx_cur = next_idx;
+            SLT_DBG(*this, "try adding media chunk idx = %zu to batch, res = %d\n", next_idx, res);
+            // if res != 0, batch is full or chunk is not compatible -> this loop breaks
+        }
+
+        // TODO @ngxson : move this log line to debug when it becomes more stable
+        SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
+
+        res = mtmd_batch_encode(mbatch.get());
+        if (res != 0) {
+            SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res);
+            return -1;
+        }
+
+        // second attempt: the freshly encoded batch should now hold this chunk's embeddings
+        return try_decode();
+    }
 };


@@ -781,6 +876,7 @@ private:
            mparams.warmup           = params_base.warmup;
            mparams.image_min_tokens = params_base.image_min_tokens;
            mparams.image_max_tokens = params_base.image_max_tokens;
+            mparams.batch_max_tokens = params_base.mtmd_batch_max_tokens;
            mparams.media_marker     = get_media_marker();
        }

@@ -2928,7 +3024,7 @@ private:
                                send_partial_response(slot, {}, false, true);
                            }
                        }
-                    }
+                    } // end of SLOT_STATE_STARTED

                    if (!slot.can_split()) {
                        // cannot fit the prompt in the current batch - will try next iter
@@ -2983,10 +3079,18 @@ private:
                    bool has_mtmd = false;

                    // check if we should process the image
-                    while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
+                    while (true) {
+                        auto cur_token_idx = slot.prompt.n_tokens();
+                        if (
+                            cur_token_idx >= slot.task->n_tokens() ||
+                            input_tokens[cur_token_idx] != LLAMA_TOKEN_NULL // encountered a text token
+                        ) {
+                            break;
+                        }
+
                        // process the image
                        size_t n_tokens_out = 0;
-                        int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
+                        int32_t res = slot.process_mtmd_chunk(cur_token_idx, n_tokens_out);
                        if (res != 0) {
                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -2994,22 +3098,11 @@ private:
                            continue;
                        }

-                        if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) {
-                            // TODO: in the future, figure out how to infuse target embeddings to the images
-                            //       for now, we skip this for simplicity
-                            //       maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above?
-                            //       [TAG_MTMD_DRAFT_PROCESSING]
-                            res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
-                            if (res != 0) {
-                                GGML_ABORT("failed to process multi-modal data on draft context\n");
-                            }
-                        }
-
                        slot.n_prompt_tokens_processed += n_tokens_out;

                        // add the image chunk to cache
                        {
-                            const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
+                            const auto & chunk = input_tokens.find_chunk(cur_token_idx);
                            slot.prompt.tokens.push_back(chunk.get()); // copy
                        }

@@ -113,7 +113,7 @@ bool server_http_context::init(const common_params & params) {
 #endif

    srv->set_default_headers({{"Server", "llama.cpp"}});
-    srv->set_logger(log_server_request);
+    // srv->set_logger(log_server_request); // TODO @ngxson : this is too spammy and not very useful; improve it in the future
    srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
        // this is fail-safe; exceptions should already handled by `ex_wrapper`

@@ -169,29 +169,108 @@ bool server_http_context::init(const common_params & params) {
        SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
    }

+    //
+    // Helper: build the request paths of all iOS splash screens
+    // Keeping the device table in one place avoids repeating every path across CMake, C++, and TypeScript.
+    // Source of truth: tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES)
+    //
+    auto generate_splash_endpoints = []() -> std::vector<std::string> {
+        // Apple device screen sizes, see
+        // https://developer.apple.com/design/human-interface-guidelines/app-icons
+        // Path format: "/apple-splash-<variant>-<size>.png"
+        static const std::vector<std::string> portrait_sizes = {
+            "640x1136",  "750x1334",  "1170x2532", "1179x2556",
+            "1206x2622", "1284x2778", "1290x2796", "1320x2868",
+            "1488x2266", "1640x2360", "1668x2388", "2048x2732",
+        };
+        // same devices as above, dimensions swapped
+        static const std::vector<std::string> landscape_sizes = {
+            "1136x640",  "1334x750",  "2532x1170", "2556x1179",
+            "2622x1206", "2778x1284", "2796x1290", "2868x1320",
+            "2266x1488", "2360x1640", "2388x1668", "2732x2048",
+        };
+
+        // variant name -> size table; light and dark variants share the same sizes
+        const std::pair<std::string, const std::vector<std::string> *> variants[] = {
+            {"portrait",       &portrait_sizes},
+            {"landscape",      &landscape_sizes},
+            {"portrait-dark",  &portrait_sizes},
+            {"landscape-dark", &landscape_sizes},
+        };
+
+        std::vector<std::string> paths;
+        paths.reserve(2 * (portrait_sizes.size() + landscape_sizes.size()));
+        for (const auto & variant : variants) {
+            for (const std::string & size : *variant.second) {
+                std::string path = "/apple-splash-";
+                path += variant.first;
+                path += "-";
+                path += size;
+                path += ".png";
+                paths.push_back(path);
+            }
+        }
+        return paths;
+    };
+
    //
    // Middlewares
    //

-    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
-        static const std::unordered_set<std::string> public_endpoints = {
+    // Public endpoints list - includes health, UI, and PWA assets
+    // These paths are served without an API key (checked by exact match on the request path).
+    // Note: despite its name, get_public_endpoints is the resulting set, not a function;
+    //       it is built once by the immediately-invoked lambda below.
+    // Source of truth for splash screen paths: tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES)
+    static const std::unordered_set<std::string> get_public_endpoints = [generate_splash_endpoints]() {
+        std::unordered_set<std::string> endpoints {
             "/health",
             "/v1/health",
             "/models",
             "/v1/models",
             "/",
             "/index.html",
-            "/bundle.js",
-            "/bundle.css",
+            // PWA assets
+            "/favicon.ico",
+            "/favicon-dark.ico",
+            "/favicon.svg",
+            "/favicon-dark.svg",
+            "/pwa-64x64.png",
+            "/pwa-192x192.png",
+            "/pwa-512x512.png",
+            "/maskable-icon-512x512.png",
+            "/apple-touch-icon-180x180.png",
+            // iOS splash screens are not listed here, they are inserted after this
+            // initializer from generate_splash_endpoints()
+            // PWA runtime files
+            // NOTE(review): "/workbox-<hash>.js" is a literal placeholder and never equals a real
+            //               request path; actual workbox files are let through by the "/workbox-"
+            //               prefix check in middleware_validate_api_key
+            "/manifest.webmanifest",
+            "/sw.js",
+            "/version.json",
+            "/workbox-<hash>.js",
+            "/_app/version.json",
+            "/build.json"
         };
+        // Add all splash screen endpoints
+        auto splash = generate_splash_endpoints();
+        for (const auto & path : splash) {
+            endpoints.insert(path);
+        }
+        return endpoints;
+    }();

+    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
        // If API key is not set, skip validation
        if (api_keys.empty()) {
            return true;
        }

        // If path is public or static file, skip validation
-        if (public_endpoints.find(req.path) != public_endpoints.end()) {
+        if (get_public_endpoints.find(req.path) != get_public_endpoints.end()) {
+            return true;
+        }
+        // Static assets (_app/ files, workbox runtime). These are embedded at build time
+        // so no API key is needed — browsers fetch them directly.
+        if (req.path.find("/_app/") == 0 || req.path.find("/workbox-") == 0) {
            return true;
        }

@@ -315,7 +394,11 @@ bool server_http_context::init(const common_params & params) {
            }
        } else {
 #if defined(LLAMA_UI_HAS_ASSETS)
-            auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) {
+            // Embedded assets are immutable — cache aggressively for PWA/sw offline support.
+            // PWA runtime files (sw.js, manifest, version.json) use no-cache for revalidation.
+            // Bundle files use Vite content hashes (bundle.<hash>.js/css) so each build
+            // produces a different filename — browsers naturally get a fresh copy on upgrade.
+            auto serve_asset_cached = [](const std::string & name, const char * mime, bool with_isolation_headers) {
                return [name, mime, with_isolation_headers](const httplib::Request & req, httplib::Response & res) {
                    const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
                    if (!a) {
@@ -334,14 +417,129 @@ bool server_http_context::init(const common_params & params) {
                        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
                        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                    }
+                    res.set_header("Cache-Control", "public, max-age=31536000, immutable");
                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
                    return false;
                };
            };

-            srv->Get(params.api_prefix + "/",           serve_asset("index.html", "text/html; charset=utf-8",              true));
-            srv->Get(params.api_prefix + "/bundle.js",  serve_asset("bundle.js",  "application/javascript; charset=utf-8", false));
-            srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8",               false));
+            // Handler factory for embedded assets that must be revalidated by the browser:
+            // same as serving a regular embedded asset, but with "Cache-Control: no-cache".
+            // Responds with 404 when the asset `name` is not embedded in this build.
+            auto serve_asset_nocache = [](const std::string & name, const char * mime, bool with_isolation_headers) {
+                return [name, mime, with_isolation_headers](const httplib::Request &, httplib::Response & res) {
+                    const llama_ui_asset * asset = llama_ui_find_asset(name.c_str());
+                    if (asset != nullptr) {
+                        if (with_isolation_headers) {
+                            res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                            res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+                        }
+                        res.set_header("Cache-Control", "no-cache");
+                        res.set_content(reinterpret_cast<const char*>(asset->data), asset->size, mime);
+                    } else {
+                        res.status = 404;
+                    }
+                    return false;
+                };
+            };
+
+            // Bundle files in _app/immutable/ — SvelteKit outputs them here and index.html
+            // and sw.js reference them via these paths (vanilla build, no plugin).
+            // The embedded asset name is derived from the request path, so the api_prefix the
+            // routes are registered under must be removed first; otherwise every bundle request
+            // would end in 404 as soon as a non-empty api_prefix is configured.
+            auto serve_bundle = [serve_asset_cached, api_prefix = params.api_prefix](const httplib::Request & req, httplib::Response & res) {
+                static const std::string js_prefix  = "/_app/immutable/bundle.";
+                static const std::string css_prefix = "/_app/immutable/assets/bundle.";
+
+                std::string path = req.path;
+                if (!api_prefix.empty() && path.rfind(api_prefix, 0) == 0) {
+                    path.erase(0, api_prefix.size());
+                }
+
+                // pick the mime type from the location; require a non-empty part after the prefix
+                const char * mime = nullptr;
+                if (path.size() > js_prefix.size() && path.rfind(js_prefix, 0) == 0) {
+                    mime = "application/javascript; charset=utf-8";
+                } else if (path.size() > css_prefix.size() && path.rfind(css_prefix, 0) == 0) {
+                    mime = "text/css; charset=utf-8";
+                } else {
+                    res.status = 404;
+                    return false;
+                }
+
+                // asset name = path without the leading /
+                return serve_asset_cached(path.substr(1), mime, false)(req, res);
+            };
+
+            // _app/ paths — vanilla SvelteKit output, index.html and sw.js reference
+            // bundles and version.json here directly.
+            // Bundles have a content hash in their name and can be cached aggressively.
+            // version.json keeps the same name across builds, so it is served with "no-cache"
+            // (like the other PWA runtime files) to make clients revalidate it.
+            srv->Get(params.api_prefix + R"(/_app/immutable/bundle\.[^/]+\.js)",  serve_bundle);
+            srv->Get(params.api_prefix + R"(/_app/immutable/assets/bundle\.[^/]+\.css)", serve_bundle);
+            srv->Get(params.api_prefix + "/_app/version.json",                    serve_asset_nocache("_app/version.json", "application/json; charset=utf-8", false));
+
+            // Workbox runtime file: the asset name is the request path without the api_prefix
+            // and without the leading /. Anything that is not "workbox-<something>" gets a 404.
+            auto serve_workbox = [serve_asset_cached, api_prefix = params.api_prefix](const httplib::Request & req, httplib::Response & res) {
+                std::string path = req.path;
+                // the route is registered under api_prefix, strip it before building the asset name
+                if (!api_prefix.empty() && path.rfind(api_prefix, 0) == 0) {
+                    path.erase(0, api_prefix.size());
+                }
+                // guard against an empty path: substr(1) would throw on it
+                const std::string name = path.empty() ? path : path.substr(1);
+                if (name.rfind("workbox-", 0) == 0 && name.size() > 10) {
+                    return serve_asset_cached(name, "application/javascript; charset=utf-8", false)(req, res);
+                }
+                res.status = 404;
+                return false;
+            };
+            // Static routes for the embedded UI.
+            // Each path is registered exactly once: httplib dispatches to the first matching
+            // route, so a second registration of the same path would never be reached.
+            srv->Get(params.api_prefix + R"(/workbox-[^/]+\.js)",               serve_workbox);
+            srv->Get(params.api_prefix + "/build.json",                          serve_asset_cached("build.json",         "application/json; charset=utf-8",       false));
+
+            // PWA runtime files keep the same name across builds, so they are served with
+            // "Cache-Control: no-cache" to make the browser revalidate them.
+            // Note: /version.json is an alias of the _app/version.json asset.
+            srv->Get(params.api_prefix + R"(/sw\.js)",                          serve_asset_nocache("sw.js",                "application/javascript; charset=utf-8",    false));
+            srv->Get(params.api_prefix + "/manifest.webmanifest",                serve_asset_nocache("manifest.webmanifest", "application/manifest+json; charset=utf-8", false));
+            srv->Get(params.api_prefix + "/version.json",                        serve_asset_nocache("_app/version.json",    "application/json; charset=utf-8",          false));
+
+            // Entry page (only "/" is registered here, there is no catch-all SPA fallback)
+            // NOTE(review): index.html is served with the long-lived immutable cache policy;
+            //               confirm that clients still pick up a new build after an upgrade
+            srv->Get(params.api_prefix + "/",                               serve_asset_cached("index.html",                   "text/html; charset=utf-8",                 true));
+
+            // Icons
+            srv->Get(params.api_prefix + "/favicon.ico",                    serve_asset_cached("favicon.ico",                  "image/x-icon",                             false));
+            srv->Get(params.api_prefix + "/favicon-dark.ico",               serve_asset_cached("favicon-dark.ico",             "image/x-icon",                             false));
+            srv->Get(params.api_prefix + "/favicon.svg",                    serve_asset_cached("favicon.svg",                  "image/svg+xml",                            false));
+            srv->Get(params.api_prefix + "/favicon-dark.svg",               serve_asset_cached("favicon-dark.svg",             "image/svg+xml",                            false));
+            srv->Get(params.api_prefix + "/pwa-64x64.png",                  serve_asset_cached("pwa-64x64.png",                "image/png",                                false));
+            srv->Get(params.api_prefix + "/pwa-192x192.png",                serve_asset_cached("pwa-192x192.png",              "image/png",                                false));
+            srv->Get(params.api_prefix + "/pwa-512x512.png",                serve_asset_cached("pwa-512x512.png",              "image/png",                                false));
+            srv->Get(params.api_prefix + "/maskable-icon-512x512.png",      serve_asset_cached("maskable-icon-512x512.png",    "image/png",                                false));
+            srv->Get(params.api_prefix + "/apple-touch-icon-180x180.png",   serve_asset_cached("apple-touch-icon-180x180.png", "image/png",                                false));
+
+            // iOS splash screens: one route per path from generate_splash_endpoints(), the same
+            // table that feeds the public endpoint list, so both always cover the same files
+            // (the asset name is the path without the leading /)
+            for (const auto & path : generate_splash_endpoints()) {
+                srv->Get(params.api_prefix + path, serve_asset_cached(path.substr(1), "image/png", false));
+            }
+
 #endif
        }
    }
@@ -26,7 +26,7 @@ def test_access_static_assets_without_api_key():
    """Static web UI assets should not require API key authentication (issue #21229)"""
    global server
    server.start()
-    for path in ["/", "/bundle.js", "/bundle.css"]:
+    for path in ["/", "/sw.js", "/manifest.webmanifest", "/_app/version.json"]:
        res = server.make_request("GET", path)
        assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}"

@@ -8,6 +8,8 @@ node_modules
 .wrangler
 /.svelte-kit
 /build
+dev-dist
+dist

 # OS
 .DS_Store
@@ -23,6 +25,15 @@ Thumbs.db
 vite.config.js.timestamp-*
 vite.config.ts.timestamp-*

+# PWA Artifacts
+apple-splash-*.png
+apple-touch-icon-*.png
+favicon.ico
+favicon-dark.ico
+maskable-icon-*.png
+pwa-*.png
+
+# Storybook
 *storybook.log
 storybook-static
 *.code-workspace
@@ -77,6 +77,7 @@ add_custom_target(llama-ui-assets ALL
        "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
        "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}"
        "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
+        "-DLLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
        "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
        "-DHF_VERSION=${HF_UI_VERSION}"
        "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
@@ -4,8 +4,9 @@
 	"version": "1.0.0",
 	"type": "module",
 	"scripts": {
+		"build": "npm run build-pwa-assets && vite build",
+		"build-pwa-assets": "npx @vite-pwa/assets-generator --root . --config pwa-assets.config.ts && npx @vite-pwa/assets-generator --root . --config pwa-assets-dark.config.ts && node scripts/make-icons-circular.js",
 		"dev": "bash scripts/dev.sh",
-		"build": "vite build",
 		"preview": "vite preview",
 		"prepare": "svelte-kit sync || echo ''",
 		"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
@@ -15,12 +16,15 @@
 		"lint": "prettier --check . && eslint .",
 		"test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
 		"test:e2e": "playwright test",
+		"test:e2e:pwa": "playwright test tests/e2e/pwa.e2e.ts",
 		"test:client": "vitest --project=client",
 		"test:unit": "vitest --project=unit",
+		"test:unit:pwa": "vitest --project=unit --run tests/unit/pwa.spec.ts",
+		"test:pwa": "npm run test:unit:pwa && npm run test:e2e:pwa",
 		"test:ui": "vitest --project=ui",
 		"storybook": "storybook dev -p 6006",
 		"build-storybook": "storybook build",
-		"cleanup": "rm -rf .svelte-kit build node_modules test-results"
+		"cleanup": "rm -rf .svelte-kit build node_modules test-results dist dev-dist debug-storybook.log static/pwa-*.png static/maskable-icon-*.png static/apple-touch-icon-*.png static/apple-splash-*.png static/favicon*.ico"
 	},
 	"devDependencies": {
 		"@chromatic-com/storybook": "5.0.0",
@@ -41,29 +45,31 @@
 		"@tailwindcss/forms": "0.5.10",
 		"@tailwindcss/typography": "0.5.16",
 		"@tailwindcss/vite": "4.1.11",
-		"@types/node": "^24",
+		"@types/node": "24.13.0",
+		"@vite-pwa/assets-generator": "1.0.2",
+		"@vite-pwa/sveltekit": "1.1.0",
 		"@vitest/browser": "4.1.8",
 		"@vitest/browser-playwright": "4.1.8",
 		"@vitest/coverage-v8": "4.1.8",
 		"bits-ui": "2.18.1",
 		"clsx": "2.1.1",
-		"dexie": "4.0.11",
-		"eslint": "9.39.2",
+		"dexie": "4.4.3",
+		"eslint": "9.39.4",
 		"eslint-config-prettier": "10.1.8",
-		"eslint-plugin-storybook": "10.2.4",
-		"eslint-plugin-svelte": "3.15.0",
-		"globals": "16.3.0",
+		"eslint-plugin-storybook": "10.4.2",
+		"eslint-plugin-svelte": "3.19.0",
+		"globals": "16.5.0",
 		"highlight.js": "11.11.1",
 		"http-server": "14.1.1",
 		"mdast": "3.0.0",
-		"mdsvex": "0.12.6",
+		"mdsvex": "0.12.7",
 		"mermaid": "11.15.0",
 		"mode-watcher": "1.1.0",
 		"pdfjs-dist": "5.4.54",
 		"playwright": "1.56.1",
-		"prettier": "3.6.2",
-		"prettier-plugin-svelte": "3.4.0",
-		"prettier-plugin-tailwindcss": "0.6.14",
+		"prettier": "3.8.3",
+		"prettier-plugin-svelte": "4.1.0",
+		"prettier-plugin-tailwindcss": "0.8.0",
 		"rehype-highlight": "7.0.2",
 		"rehype-katex": "7.0.1",
 		"rehype-stringify": "10.0.1",
@@ -73,25 +79,25 @@
 		"remark-html": "16.0.1",
 		"remark-math": "6.0.0",
 		"remark-rehype": "11.1.2",
-		"sass": "1.93.3",
-		"storybook": "10.3.3",
-		"svelte": "5.55.7",
-		"svelte-check": "4.3.0",
-		"svelte-sonner": "1.0.5",
-		"tailwind-merge": "3.3.1",
+		"sass": "1.100.0",
+		"storybook": "10.4.2",
+		"svelte": "5.56.1",
+		"svelte-check": "4.6.0",
+		"svelte-sonner": "1.1.1",
+		"tailwind-merge": "3.6.0",
 		"tailwind-variants": "3.2.2",
-		"tailwindcss": "4.1.11",
-		"tw-animate-css": "1.3.5",
-		"typescript": "5.8.3",
-		"typescript-eslint": "8.56.0",
+		"tailwindcss": "4.3.0",
+		"tw-animate-css": "1.4.0",
+		"typescript": "5.9.3",
+		"typescript-eslint": "8.60.1",
 		"unified": "11.0.5",
-		"unist-util-visit": "5.0.0",
+		"unist-util-visit": "5.1.0",
 		"uuid": "13.0.2",
-		"vite": "7.3.2",
+		"vite": "7.3.5",
 		"vite-plugin-devtools-json": "0.2.1",
 		"vitest": "4.1.8",
 		"vitest-browser-svelte": "2.1.1",
-		"zod": "4.2.1"
+		"workbox-window": "7.4.1"
 	},
 	"overrides": {
 		"cookie": "1.1.1"
@@ -1,11 +1,31 @@
-import { defineConfig } from '@playwright/test';
+import { defineConfig, devices } from '@playwright/test';

 export default defineConfig({
+	testDir: 'tests/e2e',
+	testMatch: ['**/*.e2e.ts'],
+	timeout: 30000,
+	expect: {
+		timeout: 5000
+	},
+	fullyParallel: true,
+	forbidOnly: !!process.env.CI,
+	retries: process.env.CI ? 2 : 0,
+	workers: process.env.CI ? 1 : undefined,
+	reporter: 'line',
+	use: {
+		baseURL: 'http://localhost:8181',
+		trace: 'on-first-retry'
+	},
+	projects: [
+		{
+			name: 'chromium',
+			use: { ...devices['Desktop Chrome'] }
+		}
+	],
 	webServer: {
 		command: 'npm run build && npx http-server ./dist -p 8181',
 		port: 8181,
 		timeout: 120000,
-		reuseExistingServer: false
-	},
-	testDir: 'tests/e2e'
+		reuseExistingServer: !process.env.CI
+	}
 });
@@ -0,0 +1,20 @@
+import { defineConfig } from '@vite-pwa/assets-generator/config';
+
+export default defineConfig({ // Asset-generator config for the dark favicon variant only
+	headLinkOptions: {
+		preset: '2023'
+	},
+	preset: {
+		transparent: {
+			sizes: [], // empty size list — presumably no transparent PNG icons are emitted here; confirm against generator docs
+			favicons: [[48, 'favicon-dark.ico']] // one [size, file name] pair: a 48px favicon-dark.ico
+		},
+		maskable: {
+			sizes: [] // empty size list for maskable icons
+		},
+		apple: {
+			sizes: [] // empty size list for apple-touch icons
+		}
+	},
+	images: ['static/favicon-dark.svg'] // source image for this run
+});
@@ -0,0 +1,51 @@
+import {
+	combinePresetAndAppleSplashScreens,
+	defineConfig,
+	minimal2023Preset
+} from '@vite-pwa/assets-generator/config';
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+import { THEME_COLORS, PWA_GENERATOR_DEVICES, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
+import { SplashOrientation } from './src/lib/enums/splash.enums';
+
+export default defineConfig({ // Asset-generator config: minimal2023Preset icons plus Apple splash screens from static/favicon.svg
+	headLinkOptions: {
+		preset: PWA_ASSET_GENERATOR.LINK_PRESET
+	},
+	preset: combinePresetAndAppleSplashScreens(
+		minimal2023Preset,
+		{
+			padding: PWA_ASSET_GENERATOR.SPLASH_PADDING,
+			resizeOptions: {
+				background: THEME_COLORS.BACKGROUND_LIGHT, // light splash background
+				fit: PWA_ASSET_GENERATOR.FIT_MODE
+			},
+			darkResizeOptions: {
+				background: THEME_COLORS.BACKGROUND_DARK, // dark splash background
+				fit: PWA_ASSET_GENERATOR.FIT_MODE
+			},
+			darkImageResolver: async (imageName: string) => {
+				if (imageName.endsWith('favicon.svg')) { // swap in the dark SVG for the favicon source; any other name resolves to undefined
+					return readFileSync(resolve('static/favicon-dark.svg')); // path is resolved against the current working directory
+				}
+			},
+			linkMediaOptions: {
+				log: true,
+				addMediaScreen: PWA_ASSET_GENERATOR.ADD_MEDIA_SCREEN,
+				basePath: PWA_ASSET_GENERATOR.BASE_PATH,
+				xhtml: PWA_ASSET_GENERATOR.XHTML
+			},
+			png: {
+				compressionLevel: PWA_ASSET_GENERATOR.PNG_COMPRESSION_LEVEL,
+				quality: PWA_ASSET_GENERATOR.PNG_QUALITY
+			},
+			name: (landscape, size, dark) => { // file name: apple-splash-<orientation>-<darkPrefix><width>x<height>.png
+				const orientation = landscape ? SplashOrientation.LANDSCAPE : SplashOrientation.PORTRAIT;
+				const darkPrefix = dark ? PWA_ASSET_GENERATOR.DARK_PREFIX : ''; // empty for the light variant
+				return `apple-splash-${orientation}-${darkPrefix}${size.width}x${size.height}.png`;
+			}
+		},
+		PWA_GENERATOR_DEVICES
+	),
+	images: ['static/favicon.svg'] // source image for the light variant
+});
@@ -0,0 +1,137 @@
+#!/usr/bin/env node
+
+/**
+ * Apply circular mask to pwa-*.png icons.
+ * Uses the maskable icon as source (white bg, full logo) to avoid
+ * the small-colormap pwa icons looking bad when cropped to a circle.
+ *
+ * Usage: node scripts/make-icons-circular.js [--padding-pct <0-50>] [--scale-pct <50-100>]
+ *
+ * - padding-pct: percentage of icon size kept as padding around the circle (default: 0)
+ * - scale-pct: scale down the source image before cropping (default: 85)
+ *
+ * maskable-icon and apple-touch-icon are left untouched.
+ */
+
+import sharp from 'sharp';
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+const STATIC_DIR = path.resolve(__dirname, '..', 'static');
+
+const paddingPct = process.argv.reduce((acc, arg, i, args) => {
+	if (arg === '--padding-pct' && args[i + 1]) return parseFloat(args[i + 1]);
+	return acc;
+}, 0);
+
+// Scale down the source image before cropping to circle
+const scalePct = process.argv.reduce((acc, arg, i, args) => {
+	if (arg === '--scale-pct' && args[i + 1]) return parseFloat(args[i + 1]);
+	return acc;
+}, 85); // default 85% - icon fills 85% of the circular area
+
+// Source for circular icons: the maskable icon (white bg, full logo)
+const sourceIcon = 'maskable-icon-512x512.png';
+const targetIcons = ['pwa-64x64.png', 'pwa-192x192.png', 'pwa-512x512.png'];
+
+// maskable-icon and apple-touch-icon stay square
+const untouchedIcons = ['maskable-icon-512x512.png', 'apple-touch-icon-180x180.png'];
+
+async function makeCircle(targetFilename) {
+	const targetPath = path.join(STATIC_DIR, targetFilename);
+	const sourcePath = path.join(STATIC_DIR, sourceIcon);
+
+	if (!fs.existsSync(sourcePath)) {
+		console.log(`⏭️  ${sourceIcon} not found, skipping`);
+		return;
+	}
+	if (!fs.existsSync(targetPath)) {
+		console.log(`⏭️  ${targetFilename} not found, skipping`);
+		return;
+	}
+
+	const metadata = await sharp(targetPath).metadata();
+	const size = Math.max(metadata.width, metadata.height);
+	const radius = Math.floor((size * (1 - paddingPct / 100)) / 2);
+	const center = Math.floor(size / 2);
+
+	// Build circular mask as RGBA buffer: white opaque circle on transparent bg
+	const maskBuf = Buffer.alloc(size * size * 4, 0);
+	for (let y = 0; y < size; y++) {
+		for (let x = 0; x < size; x++) {
+			const dx = x - center;
+			const dy = y - center;
+			const dist = Math.sqrt(dx * dx + dy * dy);
+			if (dist < radius) {
+				const i = (y * size + x) * 4;
+				maskBuf[i] = 255;
+				maskBuf[i + 1] = 255;
+				maskBuf[i + 2] = 255;
+				maskBuf[i + 3] = 255;
+			}
+		}
+	}
+
+	const tmpMask = path.join(STATIC_DIR, '.mask-tmp.png');
+	await sharp(maskBuf, {
+		raw: { width: size, height: size, channels: 4 }
+	})
+		.png()
+		.toFile(tmpMask);
+
+	// Step 1: Scale source relative to circle diameter (not full icon), composite centered onto white canvas of full size
+	const circleDiameter = Math.floor(size * (1 - paddingPct / 100));
+	const scaledSize = Math.floor((circleDiameter * scalePct) / 100);
+	const offset = Math.floor((size - scaledSize) / 2);
+
+	const scaledBuf = await sharp(sourcePath)
+		.resize(scaledSize, scaledSize, {
+			fit: 'cover',
+			background: { r: 255, g: 255, b: 255, alpha: 1 }
+		})
+		.ensureAlpha()
+		.png()
+		.toBuffer();
+
+	// Step 2: Composite scaled image onto white background, then apply circular mask
+	const output = await sharp({
+		create: {
+			width: size,
+			height: size,
+			channels: 4,
+			background: { r: 255, g: 255, b: 255, alpha: 1 }
+		}
+	})
+		.composite([
+			{ input: scaledBuf, top: offset, left: offset },
+			{ input: tmpMask, top: 0, left: 0, blend: 'dest-in' }
+		])
+		.png()
+		.toBuffer();
+
+	fs.writeFileSync(targetPath, output);
+	fs.unlinkSync(tmpMask);
+
+	console.log(
+		`✓ ${targetFilename} → circle from ${sourceIcon}, ${paddingPct}% padding (size=${size}, r=${radius}, scale=${scalePct}%, circleDiameter=${circleDiameter})`
+	);
+}
+
+async function main() {
+	console.log(`Circular mask: ${paddingPct}% padding, ${scalePct}% scale, source=${sourceIcon}\n`);
+	for (const icon of targetIcons) {
+		await makeCircle(icon);
+	}
+
+	console.log('\nUnchanged:');
+	for (const icon of untouchedIcons) {
+		const fp = path.join(STATIC_DIR, icon);
+		console.log(`  ${icon} (${fs.existsSync(fp) ? fs.statSync(fp).size + ' bytes' : 'missing'})`);
+	}
+}
+
+main();
@@ -0,0 +1,42 @@
+import { writeFileSync, existsSync } from 'node:fs';
+import { resolve } from 'path';
+import type { Plugin } from 'vite';
+import { BUILD_CONFIG } from '../src/lib/constants/pwa';
+
+let processed = false;
+
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
+
+/**
+ * Vite plugin that writes build.json with the llama.cpp release build number.
+ *
+ * LLAMA_BUILD_NUMBER is passed from CMake -> npm -> vite via env var.
+ * Used for display of the current llama-server release (e.g. "b1234").
+ *
+ * Applies to builds only. The work is deferred by 100 ms after closeBundle and
+ * is skipped when LLAMA_BUILD_NUMBER is unset or index.html is not present in
+ * the output directory. The module-level `processed` flag is raised only once
+ * those preconditions hold, so an invocation that fires before index.html
+ * exists does not disable later ones.
+ */
+export function buildInfoPlugin(): Plugin {
+	return {
+		name: 'llamacpp:build-info',
+		apply: 'build',
+		closeBundle() {
+			setTimeout(() => {
+				try {
+					if (processed) return;
+
+					const buildNumber = process.env.LLAMA_BUILD_NUMBER;
+					if (!buildNumber) return;
+
+					const outDir = resolve(OUTPUT_DIR);
+					const indexPath = resolve(outDir, 'index.html');
+					if (!existsSync(indexPath)) return;
+
+					// Everything needed is available: from here on this runs at most once
+					processed = true;
+
+					const buildJsonPath = resolve(outDir, 'build.json');
+					writeFileSync(buildJsonPath, JSON.stringify({ version: buildNumber }), 'utf-8');
+					console.log(`Created build.json (version: ${buildNumber})`);
+				} catch (error) {
+					console.error('Failed to write build.json:', error);
+				}
+			}, 100);
+		}
+	};
+}
@@ -1,105 +0,0 @@
-import {
-	readFileSync,
-	writeFileSync,
-	existsSync,
-	readdirSync,
-	copyFileSync,
-	rmSync,
-	unlinkSync
-} from 'fs';
-import { resolve } from 'path';
-import type { Plugin } from 'vite';
-
-const GUIDE_FOR_FRONTEND = `
-<!--
-  This is a static build of the frontend.
-  It is automatically generated by the build process.
-  Do not edit this file directly.
-  To make changes, refer to the "Web UI" section in the README.
-->
-`.trim();
-
-const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist';
-
-export function llamaCppBuildPlugin(): Plugin {
-	return {
-		name: 'llamacpp:build',
-		apply: 'build',
-		closeBundle() {
-			setTimeout(() => {
-				try {
-					const outDir = resolve(OUTPUT_DIR);
-					const indexPath = resolve(outDir, 'index.html');
-					if (!existsSync(indexPath)) return;
-
-					let content = readFileSync(indexPath, 'utf-8');
-
-					// Inline favicon as base64 data URL
-					const faviconPath = resolve('static/favicon.svg');
-					if (existsSync(faviconPath)) {
-						const faviconContent = readFileSync(faviconPath, 'utf-8');
-						const faviconBase64 = Buffer.from(faviconContent).toString('base64');
-						const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`;
-						content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`);
-						console.log('✓ Inlined favicon.svg as base64 data URL');
-					}
-
-					content = content.replace(/\r/g, '');
-					content = GUIDE_FOR_FRONTEND + '\n' + content;
-
-					// Keep the Vite hash as a query string so each build busts the browser cache
-					content = content.replace(/\/_app\/immutable\/bundle\.([^".]+)\.js/g, './bundle.js?$1');
-					content = content.replace(
-						/\/_app\/immutable\/assets\/bundle\.([^".]+)\.css/g,
-						'./bundle.css?$1'
-					);
-					content = content.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__');
-
-					writeFileSync(indexPath, content, 'utf-8');
-					console.log('✓ Updated index.html');
-
-					// Copy bundle.*.js -> bundle.js at output root
-					const immutableDir = resolve(outDir, '_app/immutable');
-					const bundleDir = resolve(outDir, '_app/immutable/assets');
-
-					if (existsSync(immutableDir)) {
-						const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/));
-						if (jsFiles.length > 0) {
-							copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js'));
-							// Normalize __sveltekit_<hash> to __sveltekit__ in bundle.js
-							const bundleJsPath = resolve(outDir, 'bundle.js');
-							let bundleJs = readFileSync(bundleJsPath, 'utf-8');
-							bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__');
-							writeFileSync(bundleJsPath, bundleJs, 'utf-8');
-							console.log(`✓ Copied ${jsFiles[0]} -> bundle.js`);
-						}
-					}
-
-					// Copy bundle.*.css -> bundle.css at output root
-					if (existsSync(bundleDir)) {
-						const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/));
-						if (cssFiles.length > 0) {
-							copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css'));
-							console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`);
-						}
-					}
-
-					// Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz
-					const appDir = resolve(outDir, '_app');
-					if (existsSync(appDir)) {
-						rmSync(appDir, { recursive: true, force: true });
-						console.log('✓ Removed _app directory');
-					}
-
-					const faviconOut = resolve(outDir, 'favicon.svg');
-					if (existsSync(faviconOut)) {
-						unlinkSync(faviconOut);
-						console.log('✓ Removed favicon.svg');
-					}
-				} catch (error) {
-					console.error('Failed to process build output:', error);
-				}
-			}, 100);
-		}
-	};
-}
@@ -0,0 +1,61 @@
+import { readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { resolve } from 'path';
+import type { Plugin } from 'vite';
+import { BUILD_CONFIG } from '../src/lib/constants/pwa';
+
+let processed = false;
+
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
+
+/**
+ * Apply literal (non-regex) substitutions to a text file in place.
+ *
+ * Each [from, to] pair replaces every occurrence of `from`, in the order given.
+ * A missing file is ignored, and the file is written back only when at least
+ * one substitution changed its content.
+ */
+function rewrite(path: string, pairs: [string, string][]): void {
+	if (existsSync(path)) {
+		const original = readFileSync(path, 'utf-8');
+		const updated = pairs.reduce(
+			(acc, [needle, replacement]) => acc.split(needle).join(replacement),
+			original
+		);
+		if (updated !== original) {
+			writeFileSync(path, updated, 'utf-8');
+		}
+	}
+}
+
+/**
+ * Vite plugin that turns SvelteKit's root-absolute base references into relative ones,
+ * so the build output can be served from any subpath.
+ *
+ * The SPA fallback page is emitted with "/_app/ references because paths.relative does
+ * not apply to a depth agnostic fallback. Rewriting them to "./_app/ makes a plain
+ * recursive copy of the output into /any/subdir/ resolve assets against the document URL.
+ * In sw.js the navigate fallback precache key and handler are rewritten from "/" to "./".
+ *
+ * Applies to builds only. Like buildInfoPlugin, the work is deferred after closeBundle so
+ * that index.html (adapter-static) and sw.js (PWA plugin) already exist, and it runs at
+ * most once per process.
+ */
+export function relativizeBasePlugin(): Plugin {
+	const relativize = (): void => {
+		if (processed) return;
+		processed = true;
+
+		const outDir = resolve(OUTPUT_DIR);
+		const indexHtml = resolve(outDir, 'index.html');
+		const serviceWorker = resolve(outDir, 'sw.js');
+
+		// index.html: modulepreload, stylesheet and bootstrap import reference "/_app/
+		rewrite(indexHtml, [['"/_app/', '"./_app/']]);
+
+		// sw.js: the only absolute entries are the navigate fallback precache key and handler
+		rewrite(serviceWorker, [
+			['{url:"/"', '{url:"./"'],
+			['createHandlerBoundToURL("/"', 'createHandlerBoundToURL("./"']
+		]);
+
+		console.log('Relativized base refs in index.html and sw.js');
+	};
+
+	return {
+		name: 'llamacpp:relativize-base',
+		apply: 'build',
+		closeBundle() {
+			setTimeout(() => {
+				try {
+					relativize();
+				} catch (error) {
+					console.error('Failed to relativize base refs:', error);
+				}
+			}, 100);
+		}
+	};
+}
@@ -0,0 +1,115 @@
+import { readdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { resolve } from 'path';
+import type { Plugin } from 'vite';
+import { TAB, NEWLINE } from '../src/lib/constants/code';
+import { APPLE_DEVICES, BUILD_CONFIG, REGEX_PATTERNS, SPLASH_LINK } from '../src/lib/constants/pwa';
+import type { SplashDimensions } from '../src/lib/types';
+import { SplashOrientation } from '../src/lib/enums/splash.enums';
+
+let processed = false;
+
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
+
+/**
+ * Generate iOS splash screen <link> tags from generated apple-splash-*.png files.
+ *
+ * Every file in `outDir` whose name matches REGEX_PATTERNS.SPLASH_FILE is mapped, via
+ * the pixel size encoded in its name, to an APPLE_DEVICES entry; the tag's media query
+ * is built from that entry's width, height and dpr plus the orientation from the file
+ * name. Files whose size matches no known device are skipped with a warning.
+ *
+ * File names are processed in sorted order so the emitted markup does not depend on
+ * the directory listing order.
+ *
+ * @param outDir directory to scan; a missing directory yields an empty result
+ * @returns HTML link strings to be injected into the page head, light variants first,
+ *          then dark variants
+ */
+export function generateSplashScreenLinks(outDir: string): string[] {
+	if (!existsSync(outDir)) return [];
+
+	const files = readdirSync(outDir).sort();
+	if (!files.some((f) => f.match(REGEX_PATTERNS.SPLASH_FILE))) return [];
+
+	// "<w>x<h>" -> device spec, registered for both orientations and for both
+	// logical-point dimensions and pixel dimensions (used by actual generated splash files)
+	const dimMap = new Map<string, SplashDimensions>();
+	for (const [dims, spec] of Object.entries(APPLE_DEVICES)) {
+		const [w, h] = dims.split('x').map(Number);
+		const entry: SplashDimensions = { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr };
+		for (const factor of [1, spec.dpr]) {
+			dimMap.set(`${w * factor}x${h * factor}`, entry);
+			dimMap.set(`${h * factor}x${w * factor}`, entry);
+		}
+	}
+
+	const lightLinks: string[] = [];
+	const darkLinks: string[] = [];
+
+	for (const file of files) {
+		const match = file.match(REGEX_PATTERNS.SPLASH_FILE);
+		if (!match) continue;
+		const orientation = match[1] as SplashOrientation;
+		const isDark = !!match[2];
+		const pixelW = parseInt(match[3], 10);
+		const pixelH = parseInt(match[4], 10);
+
+		const key = `${pixelW}x${pixelH}`;
+		const spec = dimMap.get(key);
+		if (!spec) {
+			console.warn(`Unknown splash screen dimensions: ${key} (${file})`);
+			continue;
+		}
+
+		const { deviceW, deviceH, dpr } = spec;
+		const media = `screen and (device-width: ${deviceW}px) and (device-height: ${deviceH}px) and (-webkit-device-pixel-ratio: ${dpr}) and (orientation: ${orientation})`;
+		const href = `./${file}`;
+
+		if (isDark) {
+			darkLinks.push(
+				`${SPLASH_LINK.HTML} media="${media}${SPLASH_LINK.DARK_MEDIA_SUFFIX}" href="${href}">`
+			);
+		} else {
+			lightLinks.push(`${SPLASH_LINK.HTML} media="${media}" href="${href}">`);
+		}
+	}
+
+	return [...lightLinks, ...darkLinks];
+}
+
+/**
+ * Vite plugin (build only) that post-processes the emitted index.html:
+ * injects the iOS splash screen <link> tags before the closing head tag, strips
+ * carriage returns and prepends BUILD_CONFIG.GUIDE_COMMENT.
+ *
+ * The work is deferred by 100 ms after closeBundle and skipped while index.html is
+ * not present in the output directory. The module-level `processed` flag is raised
+ * only once index.html exists, so an early invocation does not disable later ones,
+ * while the non-idempotent edits are still applied at most once.
+ */
+export function splashScreenPlugin(): Plugin {
+	return {
+		name: 'llamacpp:splash-screen',
+		apply: 'build',
+		closeBundle() {
+			setTimeout(() => {
+				try {
+					if (processed) return;
+
+					const outDir = resolve(OUTPUT_DIR);
+					const indexPath = resolve(outDir, 'index.html');
+					if (!existsSync(indexPath)) return;
+
+					// index.html is available: from here on this runs at most once
+					processed = true;
+
+					let content = readFileSync(indexPath, 'utf-8');
+
+					// Inject iOS splash screen <link> tags into <head>.
+					// The @vite-pwa/assets-generator generates apple-splash-*.png files;
+					// this scans them and creates the <link> tags SvelteKit needs.
+					const splashLinks = generateSplashScreenLinks(outDir);
+					if (splashLinks.length > 0) {
+						console.log(`Generated ${splashLinks.length} apple-splash link tags`);
+						const splashHtml = splashLinks.map((l) => TAB + TAB + l).join(NEWLINE);
+						// Function replacer: the inserted markup is taken literally, so a "$"
+						// in it is never interpreted as a replacement pattern
+						content = content.replace(
+							REGEX_PATTERNS.HEAD_CLOSE,
+							() => splashHtml + NEWLINE + TAB + TAB + '</head>'
+						);
+					}
+
+					// Remove \r characters (Windows line endings)
+					content = content.replace(/\r/g, '');
+					content = BUILD_CONFIG.GUIDE_COMMENT + NEWLINE + content;
+
+					writeFileSync(indexPath, content, 'utf-8');
+					console.log('Updated index.html');
+				} catch (error) {
+					console.error('Failed to process build output:', error);
+				}
+			}, 100);
+		}
+	};
+}
@@ -1,6 +1,9 @@
 // See https://svelte.dev/docs/kit/types#app.d.ts
 // for information about these interfaces

+import 'vite-plugin-pwa/pwa-assets';
+import 'vite-plugin-pwa/svelte';
+
 // Import chat types from dedicated module

 import type {
@@ -2,10 +2,17 @@
 <html lang="en">
 	<head>
 		<meta charset="utf-8" />
-		<link rel="icon" href="%sveltekit.assets%/favicon.svg" />
+		<link rel="icon" href="favicon.ico" sizes="48x48" />
+		<link rel="icon" href="favicon.svg" sizes="any" type="image/svg+xml" />
+
+		<link rel="apple-touch-icon" href="apple-touch-icon-180x180.png" />
+
+		<link rel="manifest" href="./manifest.webmanifest" />
+
 		<meta name="viewport" content="width=device-width, initial-scale=1" />
 		%sveltekit.head%
 	</head>
+
 	<body data-sveltekit-preload-data="hover">
 		<div style="display: contents">%sveltekit.body%</div>
 	</body>
@@ -20,6 +20,8 @@
 	import { ColorMode } from '$lib/enums/ui.enums';
 	import { fade } from 'svelte/transition';
 	import { goto } from '$app/navigation';
+	import { Button } from '$lib/components/ui/button';
+	import { RefreshCw } from '@lucide/svelte';
 	import { page } from '$app/state';
 	import { setChatSettingsConfigContext } from '$lib/contexts';
 	import { settingsReferrer } from '$lib/stores/settings-referrer.svelte';
@@ -164,6 +166,15 @@
 								onConfigChange={handleConfigChange}
 								onThemeChange={handleThemeChange}
 							/>
+
+							{#if currentSection.title === SETTINGS_SECTION_TITLES.GENERAL}
+								<div class="flex justify-end">
+									<Button variant="outline" onclick={() => window.location.reload()}>
+										<RefreshCw class="h-3 w-3" />
+										Reload app
+									</Button>
+								</div>
+							{/if}
 						</div>
 					{/if}
 				</div>
@@ -0,0 +1,23 @@
+<script lang="ts">
+	import { APPLE_META_TAGS, MEDIA_QUERIES, THEME_COLORS } from '$lib/constants/pwa';
+	import { APP_NAME } from '$lib/constants';
+
+	let { appName = APP_NAME } = $props(); // value of the Apple web-app title meta tag; defaults to APP_NAME
+</script>
+
+<svelte:head>
+	<!-- Theme color for light/dark modes -->
+	<meta name="theme-color" content={THEME_COLORS.LIGHT} media={MEDIA_QUERIES.PREFERS_LIGHT} />
+	<meta name="theme-color" content={THEME_COLORS.DARK} media={MEDIA_QUERIES.PREFERS_DARK} />
+
+	<!-- Apple mobile web app meta tags -->
+	<meta
+		name={APPLE_META_TAGS.MOBILE_WEB_APP_CAPABLE.name}
+		content={APPLE_META_TAGS.MOBILE_WEB_APP_CAPABLE.content}
+	/>
+	<meta
+		name={APPLE_META_TAGS.STATUS_BAR_STYLE.name}
+		content={APPLE_META_TAGS.STATUS_BAR_STYLE.content}
+	/>
+	<meta name={APPLE_META_TAGS.MOBILE_WEB_APP_TITLE.name} content={appName} />
+</svelte:head>
@@ -0,0 +1,35 @@
+<script lang="ts">
+	import * as Card from '$lib/components/ui/card';
+	import { Button } from '$lib/components/ui/button';
+
+	let { needRefresh: needRefreshProp, updateServiceWorker, forceReload } = $props(); // needRefresh: show the card; updateServiceWorker: called on click; forceReload: also reload the page on click
+	let needRefresh = $derived(needRefreshProp ?? false); // follows the prop (false when unset); overwritten locally by the click handler to hide the card
+</script>
+
+{#if needRefresh}
+	<Card.Root class="overflow-hidden gap-1 py-5">
+		<Card.Header class="px-5">
+			<Card.Title class="text-sm font-medium">Update available</Card.Title>
+		</Card.Header>
+
+		<Card.Content class="gap-6 grid px-5">
+			<p class="text-xs text-muted-foreground">A new version is available. Reload to update.</p>
+
+			<Button
+				class="justify-self-end-safe"
+				size="sm"
+				onclick={() => {
+					updateServiceWorker(); // NOTE(review): return value is not awaited — confirm whether this callback is asynchronous
+
+					if (forceReload) {
+						window.location.reload(); // explicit reload, performed right after the call above
+					}
+
+					needRefresh = false; // hide the card
+				}}
+			>
+				Reload
+			</Button>
+		</Card.Content>
+	</Card.Root>
+{/if}
@@ -0,0 +1,2 @@
+export { default as PwaMetaTags } from './PwaMetaTags.svelte';
+export { default as PwaRefreshAlert } from './PwaRefreshAlert.svelte';
@@ -0,0 +1 @@
+export const APP_NAME = import.meta.env?.VITE_PUBLIC_APP_NAME || 'llama-ui';
@@ -1,4 +1,5 @@
 export const NEWLINE = '\n';
+export const TAB = '\t';
 export const DEFAULT_LANGUAGE = 'text';
 export const LANG_PATTERN = /^(\w*)\n?/;
 export const AMPERSAND_REGEX = /&/g;
@@ -3,6 +3,7 @@

 export * from './agentic';
 export * from './api-endpoints';
+export * from './app';
 export * from './attachment-labels';
 export * from './database';
 export * from './reasoning-effort';
@@ -36,6 +37,7 @@ export * from './message-export';
 export * from './model-id';
 export * from './precision';
 export * from './processing-info';
+export * from './pwa';
 export * from './routes';
 export * from './sandbox';
 export * from './settings-keys';
@@ -0,0 +1,352 @@
+/**
+ * Centralized PWA constants to avoid magic strings, regexes, and duplicated
+ * definitions across the codebase.
+ */
+
+import { APP_NAME } from './app';
+
+/** CSS media query strings for the user's preferred color scheme. */
+export const MEDIA_QUERIES = {
+	PREFERS_DARK: '(prefers-color-scheme: dark)',
+	PREFERS_LIGHT: '(prefers-color-scheme: light)'
+} as const;
+
+/**
+ * Color values shared by the PWA code.
+ *
+ * Top-level entries are CSS colors (hex values or the `white` keyword);
+ * BACKGROUND_LIGHT is also used for the manifest's background and theme color.
+ * NOTE(review): the TITLE_UPDATE_ALERT values look like Tailwind palette names
+ * rather than CSS colors — confirm against their consumer.
+ */
+export const THEME_COLORS = {
+	LIGHT: '#ffffff',
+	DARK: '#0d0d0d',
+	ACCENT_BLUE: '#2563eb',
+	ACCENT_BLUE_HOVER: '#1d4ed8',
+	BACKGROUND_LIGHT: 'white',
+	BACKGROUND_DARK: '#111111',
+	TITLE_UPDATE_ALERT: {
+		BORDER_LIGHT: 'zinc-200',
+		BORDER_DARK: 'zinc-700',
+		BG_LIGHT: 'white',
+		BG_DARK: 'zinc-800',
+		TEXT_LIGHT: 'zinc-500',
+		TEXT_DARK: 'zinc-400'
+	}
+} as const;
+
+/** Relative file names of the light and dark favicon variants (ICO and SVG). */
+export const FAVICON_PATHS = {
+	ICO_LIGHT: 'favicon.ico',
+	ICO_DARK: 'favicon-dark.ico',
+	SVG_LIGHT: 'favicon.svg',
+	SVG_DARK: 'favicon-dark.svg'
+} as const;
+
+/** CSS selectors that locate the favicon `<link>` elements in the document. */
+export const FAVICON_SELECTORS = {
+	ICO_48X48: 'link[rel="icon"][sizes="48x48"]',
+	SVG_ANY: 'link[rel="icon"][type="image/svg+xml"]'
+} as const;
+
+/** Relative file name of the 180x180 Apple touch icon. */
+export const APPLE_ASSETS = {
+	TOUCH_ICON: 'apple-touch-icon-180x180.png'
+} as const;
+
+/**
+ * Web app manifest content; passed as `manifest` in SVELTEKIT_PWA_OPTIONS.
+ *
+ * `start_url` and the icon `src` values are relative. The `as const`
+ * assertions keep `display` and `purpose` typed as string literals.
+ */
+export const PWA_MANIFEST = {
+	name: APP_NAME,
+	short_name: APP_NAME,
+	description: 'Local AI chat interface powered by llama.cpp',
+	start_url: './',
+	display: 'standalone' as const,
+	background_color: THEME_COLORS.BACKGROUND_LIGHT,
+	theme_color: THEME_COLORS.BACKGROUND_LIGHT,
+	icons: [
+		{ src: 'pwa-64x64.png', sizes: '64x64', type: 'image/png' },
+		{ src: 'pwa-192x192.png', sizes: '192x192', type: 'image/png' },
+		{ src: 'pwa-512x512.png', sizes: '512x512', type: 'image/png', purpose: 'any' as const },
+		{
+			src: 'maskable-icon-512x512.png',
+			sizes: '512x512',
+			type: 'image/png',
+			purpose: 'maskable' as const
+		}
+	]
+};
+
+/** Root-absolute paths of the four icon files that PWA_MANIFEST lists by relative name. */
+export const PWA_ICON_PATHS = {
+	PWA_64: '/pwa-64x64.png',
+	PWA_192: '/pwa-192x192.png',
+	PWA_512: '/pwa-512x512.png',
+	MASKABLE_512: '/maskable-icon-512x512.png'
+} as const;
+
+/**
+ * Apple splash-screen targets, keyed by portrait pixel resolution
+ * (`<width * dpr>x<height * dpr>`, the same dimensions that appear in the
+ * `apple-splash-*-<W>x<H>.png` file names). Each value holds the logical size
+ * in points and the device pixel ratio.
+ *
+ * Device names in the comments follow Apple's published display specs.
+ */
+export const APPLE_DEVICES = {
+	// iPhones (DPR 3)
+	'1170x2532': { width: 390, height: 844, dpr: 3 }, // iPhone 13, 13 Pro, 14, 16e
+	'1179x2556': { width: 393, height: 852, dpr: 3 }, // iPhone 14 Pro, 15, 15 Pro, 16
+	'1206x2622': { width: 402, height: 874, dpr: 3 }, // iPhone 16 Pro
+	'1284x2778': { width: 428, height: 926, dpr: 3 }, // iPhone 13 Pro Max, 14 Plus
+	'1290x2796': { width: 430, height: 932, dpr: 3 }, // iPhone 14 Pro Max, 15 Plus, 15 Pro Max, 16 Plus
+	'1320x2868': { width: 440, height: 956, dpr: 3 }, // iPhone 16 Pro Max
+	// iPhones (DPR 2)
+	'750x1334': { width: 375, height: 667, dpr: 2 }, // iPhone SE 4.7" (also iPhone 6/7/8)
+	'640x1136': { width: 320, height: 568, dpr: 2 }, // iPhone SE 4"
+	// iPads (DPR 2)
+	'1488x2266': { width: 744, height: 1133, dpr: 2 }, // iPad mini 8.3"
+	'1640x2360': { width: 820, height: 1180, dpr: 2 }, // iPad 11", iPad Air 10.9", iPad Air 11"
+	'1668x2388': { width: 834, height: 1194, dpr: 2 }, // iPad Pro 11"
+	'2048x2732': { width: 1024, height: 1366, dpr: 2 }, // iPad Pro 12.9", iPad Air 13"
+	// Legacy entries, kept unchanged so existing AppleDeviceKey values stay valid.
+	// Their keys do not equal width*dpr x height*dpr and no splash file uses these
+	// dimensions. NOTE(review): likely removable once callers are checked.
+	'1032x1376': { width: 1032, height: 1376, dpr: 2 },
+	'744x1133': { width: 376, height: 573, dpr: 2 }
+} as const;
+
+export type AppleDeviceKey = keyof typeof APPLE_DEVICES;
+
+/**
+ * Root-absolute paths of PWA-related files.
+ * WORKBOX contains a literal `<hash>` placeholder, not a concrete file name.
+ */
+export const PWA_FILE_PATHS = {
+	MANIFEST: '/manifest.webmanifest',
+	SERVICE_WORKER: '/sw.js',
+	VERSION: '/version.json',
+	WORKBOX: '/workbox-<hash>.js'
+} as const;
+
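+// Paths the server middleware serves without API key validation.
+// Keep in sync with the public_endpoints list in tools/server/server-http.cpp.
+// NOTE(review): '/workbox-<hash>.js' is a literal placeholder, not a real file name; confirm how the consumer matches it.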
+
+export const PUBLIC_ENDPOINTS = [
+	'/health',
+	'/v1/health',
+	'/models',
+	'/v1/models',
+	'/props',
+	'/metrics',
+	'/',
+	'/index.html',
+
+	'/favicon.ico',
+	'/favicon-dark.ico',
+	'/favicon.svg',
+	'/favicon-dark.svg',
+	'/pwa-64x64.png',
+	'/pwa-192x192.png',
+	'/pwa-512x512.png',
+	'/maskable-icon-512x512.png',
+	'/apple-touch-icon-180x180.png',
+	'/apple-splash-portrait-640x1136.png',
+	'/apple-splash-landscape-640x1136.png',
+	'/apple-splash-portrait-750x1334.png',
+	'/apple-splash-landscape-750x1334.png',
+	'/apple-splash-portrait-1170x2532.png',
+	'/apple-splash-landscape-1170x2532.png',
+	'/apple-splash-portrait-1179x2556.png',
+	'/apple-splash-landscape-1179x2556.png',
+	'/apple-splash-portrait-1206x2622.png',
+	'/apple-splash-landscape-1206x2622.png',
+	'/apple-splash-portrait-1284x2778.png',
+	'/apple-splash-landscape-1284x2778.png',
+	'/apple-splash-portrait-1290x2796.png',
+	'/apple-splash-landscape-1290x2796.png',
+	'/apple-splash-portrait-1320x2868.png',
+	'/apple-splash-landscape-1320x2868.png',
+	'/apple-splash-portrait-1488x2266.png',
+	'/apple-splash-landscape-1488x2266.png',
+	'/apple-splash-portrait-1640x2360.png',
+	'/apple-splash-landscape-1640x2360.png',
+	'/apple-splash-portrait-1668x2388.png',
+	'/apple-splash-landscape-1668x2388.png',
+	'/apple-splash-portrait-2048x2732.png',
+	'/apple-splash-landscape-2048x2732.png',
+	'/apple-splash-portrait-dark-640x1136.png',
+	'/apple-splash-landscape-dark-640x1136.png',
+	'/apple-splash-portrait-dark-750x1334.png',
+	'/apple-splash-landscape-dark-750x1334.png',
+	'/apple-splash-portrait-dark-1170x2532.png',
+	'/apple-splash-landscape-dark-1170x2532.png',
+	'/apple-splash-portrait-dark-1179x2556.png',
+	'/apple-splash-landscape-dark-1179x2556.png',
+	'/apple-splash-portrait-dark-1206x2622.png',
+	'/apple-splash-landscape-dark-1206x2622.png',
+	'/apple-splash-portrait-dark-1284x2778.png',
+	'/apple-splash-landscape-dark-1284x2778.png',
+	'/apple-splash-portrait-dark-1290x2796.png',
+	'/apple-splash-landscape-dark-1290x2796.png',
+	'/apple-splash-portrait-dark-1320x2868.png',
+	'/apple-splash-landscape-dark-1320x2868.png',
+	'/apple-splash-portrait-dark-1488x2266.png',
+	'/apple-splash-landscape-dark-1488x2266.png',
+	'/apple-splash-portrait-dark-1640x2360.png',
+	'/apple-splash-landscape-dark-1640x2360.png',
+	'/apple-splash-portrait-dark-1668x2388.png',
+	'/apple-splash-landscape-dark-1668x2388.png',
+	'/apple-splash-portrait-dark-2048x2732.png',
+	'/apple-splash-landscape-dark-2048x2732.png',
+	'/manifest.webmanifest',
+	'/sw.js',
+	'/version.json',
+	'/workbox-<hash>.js'
+] as const;
+/**
+ * Build output settings.
+ * - OUTPUT_DIR: relative path of the build output directory.
+ * - GUIDE_COMMENT: an HTML comment (whitespace-trimmed) stating that the file
+ *   is generated and must not be edited by hand.
+ */
+export const BUILD_CONFIG = {
+	OUTPUT_DIR: './dist',
+	GUIDE_COMMENT: `
+<!--
+  This is a static build of the frontend.
+  It is automatically generated by the build process.
+  Do not edit this file directly.
+  To make changes, refer to the "Web UI" section in the README.
+-->
+`.trim()
+} as const;
+
+/**
+ * Regular expressions used when processing build output.
+ * - SPLASH_FILE: matches a whole splash file name; capture groups are
+ *   orientation, the optional `dark-` marker, width and height.
+ * - HEAD_CLOSE: matches a closing `</head>` tag together with any tabs directly before it.
+ */
+export const REGEX_PATTERNS = {
+	SPLASH_FILE: /^apple-splash-(portrait|landscape)-(dark-)?(\d+)x(\d+)\.png$/,
+	HEAD_CLOSE: /\t*<\/head>/
+} as const;
+
+// Device names used by @vite-pwa/assets-generator for splash screen generation.
+// Keep in sync with pwa-assets.config.ts.
+// `as const` makes this a readonly tuple of string literals, so each name is a distinct type.
+export const PWA_GENERATOR_DEVICES = [
+	'iPhone 13',
+	'iPhone 13 Pro',
+	'iPhone 13 Pro Max',
+	'iPhone 14',
+	'iPhone 14 Plus',
+	'iPhone 14 Pro',
+	'iPhone 14 Pro Max',
+	'iPhone 15',
+	'iPhone 15 Plus',
+	'iPhone 15 Pro',
+	'iPhone 15 Pro Max',
+	'iPhone 16',
+	'iPhone 16 Plus',
+	'iPhone 16 Pro',
+	'iPhone 16 Pro Max',
+	'iPhone 16e',
+	'iPhone SE 4"',
+	'iPhone SE 4.7"',
+	'iPad 11"',
+	'iPad Air 10.9"',
+	'iPad Air 11"',
+	'iPad Air 13"',
+	'iPad Pro 11"',
+	'iPad Pro 12.9"',
+	'iPad mini 8.3"'
+] as const;
+
+// PWA assets generator configuration — used by pwa-assets.config.ts
+// Plain option values only; how each one maps onto the generator's options is
+// decided by the consuming config file, which is not part of this module.
+export const PWA_ASSET_GENERATOR = {
+	LINK_PRESET: '2023',
+	SPLASH_PADDING: 0.75,
+	FIT_MODE: 'contain',
+	ADD_MEDIA_SCREEN: true,
+	BASE_PATH: './',
+	XHTML: false,
+	PNG_COMPRESSION_LEVEL: 9,
+	PNG_QUALITY: 60,
+	DARK_PREFIX: 'dark-' // same marker that REGEX_PATTERNS.SPLASH_FILE accepts before the dimensions
+} as const;
+
+/** Cache lifetimes and size limits used by the service worker configuration. */
+export const CACHE_SETTINGS = {
+	IMMUTABLE_MAX_AGE_SECONDS: 31536000, // 365 days
+	API_CACHE_MAX_AGE_SECONDS: 60 * 60 * 24, // one day
+	API_CACHE_MAX_ENTRIES: 50,
+	MAX_FILE_SIZE_BYTES: 10 * 1024 * 1024 // 10 MiB; used as Workbox maximumFileSizeToCacheInBytes
+} as const;
+
+// Precache glob: files with any of the listed extensions, at any directory depth.
+// Passed to Workbox as `globPatterns`.
+export const GLOB_PATTERNS: string[] = [
+	'**/*.{js,css,html,ico,svg,png,webp,woff,woff2,json,webmanifest}'
+];
+
+// loading.html is the model loading page served by llama-server itself.
+// The SvelteKit PWA manifest transform strips the html extension from every
+// precache entry to match clean URLs, but loading.html is a plain static asset
+// with no clean URL, so static servers answer 404 and the SW install fails.
+export const GLOB_IGNORES: string[] = ['**/loading.html'];
+
+/**
+ * Settings for the periodic service worker update check.
+ * - CHECK_INTERVAL_MS: polling interval (60000 ms = one minute).
+ * - UPDATE_FETCH_OPTIONS: fetch `cache` mode and header values sent when
+ *   re-fetching the service worker script.
+ */
+export const SW_CONFIG = {
+	CHECK_INTERVAL_MS: 60000,
+	UPDATE_FETCH_OPTIONS: {
+		CACHE: 'no-store',
+		HEADERS: {
+			CACHE: 'no-store',
+			CACHE_CONTROL: 'no-cache'
+		}
+	}
+} as const;
+
+// Runtime caching configuration for Workbox
+// HANDLER is the Workbox strategy name and CACHE_NAME the cache shared by every
+// runtimeCaching entry in SVELTEKIT_PWA_OPTIONS.
+export const RUNTIME_CACHING = {
+	HANDLER: 'NetworkFirst',
+	CACHE_NAME: 'api-cache'
+} as const;
+
+/**
+ * URL patterns for the Workbox runtime caching routes.
+ *
+ * Workbox tests a RegExp route against the full request URL (`url.href`), not
+ * the pathname, so a pattern anchored as `^\/v1\/` can never match
+ * `http://host/v1/...` and the route would silently do nothing. Each pattern
+ * therefore accepts an optional `scheme://host` prefix before the path. Bare
+ * pathnames still match, so code that tests a path directly keeps working.
+ */
+export const API_CACHING_PATTERNS = {
+	// Any path under /v1/
+	V1_API: /^(?:https?:\/\/[^/]+)?\/v1\/.*/,
+	// Paths that begin with one of the listed endpoint names
+	STATIC_API: /^(?:https?:\/\/[^/]+)?\/(health|props|models|tools|slots|cors-proxy).*/
+} as const;
+
+// SvelteKit PWA plugin options
+// NAVIGATE_FALLBACK is a relative URL; SVELTEKIT_PWA_OPTIONS passes it as devOptions.navigateFallback.
+export const PWA_KIT_OPTIONS = {
+	NAVIGATE_FALLBACK: './'
+} as const;
+
+/**
+ * `<meta>` name/content pairs for Apple's web-app tags.
+ * MOBILE_WEB_APP_TITLE has no fixed content; the rendering component supplies it.
+ */
+export const APPLE_META_TAGS = {
+	MOBILE_WEB_APP_CAPABLE: { name: 'apple-mobile-web-app-capable', content: 'yes' },
+	STATUS_BAR_STYLE: { name: 'apple-mobile-web-app-status-bar-style', content: 'black-translucent' },
+	MOBILE_WEB_APP_TITLE: { name: 'apple-mobile-web-app-title' }
+} as const;
+
+// Splash screen HTML link tag prefix used by generateSplashScreenLinks
+// HTML is the opening part of the tag (no closing bracket); DARK_MEDIA_SUFFIX starts
+// with ' and ' so it can be appended to an existing media query.
+export const SPLASH_LINK = {
+	HTML: '<link rel="apple-touch-startup-image"',
+	DARK_MEDIA_SUFFIX: ' and (prefers-color-scheme: dark)'
+} as const;
+
+// SvelteKit PWA plugin configuration — used by vite.config.ts
+import type { SvelteKitPWAOptions } from '@vite-pwa/sveltekit';
+
+/**
+ * Complete options object for the @vite-pwa/sveltekit plugin, assembled from
+ * the constants above: manifest, Workbox precache/runtime caching, dev-mode
+ * behaviour and SvelteKit-specific settings.
+ */
+export const SVELTEKIT_PWA_OPTIONS: SvelteKitPWAOptions = {
+	// Strategy: generateSW - the plugin generates a service worker automatically
+	// using Workbox. For a custom SW, use 'injectManifest' instead.
+	// (No `strategies` key is set here, so this relies on the plugin default.)
+	// Web app manifest content
+	manifest: PWA_MANIFEST,
+
+	// Workbox configuration for generateSW strategy
+	workbox: {
+		// Match all static assets in the build output.
+		// Uses '**/' because SvelteKit outputs files under _app/immutable/
+		// subdirectories.
+		globPatterns: GLOB_PATTERNS,
+		globIgnores: GLOB_IGNORES,
+		maximumFileSizeToCacheInBytes: CACHE_SETTINGS.MAX_FILE_SIZE_BYTES,
+
+		// Runtime caching for API calls - use NetworkFirst so APIs are always fresh.
+		// Both routes share the same handler, cache name and expiration limits;
+		// only the URL pattern differs.
+		runtimeCaching: [
+			{
+				urlPattern: API_CACHING_PATTERNS.V1_API,
+				handler: RUNTIME_CACHING.HANDLER,
+				options: {
+					cacheName: RUNTIME_CACHING.CACHE_NAME,
+					expiration: {
+						maxEntries: CACHE_SETTINGS.API_CACHE_MAX_ENTRIES,
+						maxAgeSeconds: CACHE_SETTINGS.API_CACHE_MAX_AGE_SECONDS
+					}
+				}
+			},
+			{
+				urlPattern: API_CACHING_PATTERNS.STATIC_API,
+				handler: RUNTIME_CACHING.HANDLER,
+				options: {
+					cacheName: RUNTIME_CACHING.CACHE_NAME,
+					expiration: {
+						maxEntries: CACHE_SETTINGS.API_CACHE_MAX_ENTRIES,
+						maxAgeSeconds: CACHE_SETTINGS.API_CACHE_MAX_AGE_SECONDS
+					}
+				}
+			}
+		]
+	},
+
+	// The service worker is also enabled during development
+	devOptions: {
+		enabled: true,
+		suppressWarnings: true,
+		// Use PWA_KIT_OPTIONS.NAVIGATE_FALLBACK ('./') to match production SW behaviour.
+		// NOTE(review): the original note says the production fallback defaults to the
+		// base path '/', while the value passed here is './' — confirm they are equivalent.
+		navigateFallback: PWA_KIT_OPTIONS.NAVIGATE_FALLBACK
+	},
+
+	// SvelteKit-specific options
+	kit: {
+		// Include version file for proper cache invalidation
+		includeVersionFile: true
+	}
+};
@@ -31,6 +31,7 @@ export const SETTINGS_KEYS = {
 	SHOW_RAW_MODEL_NAMES: 'showRawModelNames',
 	SHOW_MODEL_QUANTIZATION: 'showModelQuantization',
 	SHOW_MODEL_TAGS: 'showModelTags',
+	SHOW_BUILD_VERSION: 'showBuildVersion',
 	SHOW_SYSTEM_MESSAGE: 'showSystemMessage',
 	// Sampling
 	TEMPERATURE: 'temperature',
@@ -365,6 +365,14 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 					serverKey: SETTINGS_KEYS.ALWAYS_SHOW_AGENTIC_TURNS,
 					paramType: SyncableParameterType.BOOLEAN
 				}
+			},
+			{
+				key: SETTINGS_KEYS.SHOW_BUILD_VERSION,
+				label: 'Show build version information',
+				help: 'Display the current build version in the bottom-right corner of the interface.',
+				defaultValue: false,
+				type: SettingsFieldType.CHECKBOX,
+				section: SETTINGS_SECTION_SLUGS.DISPLAY
 			}
 		]
 	},
@@ -40,6 +40,9 @@ export const DEPRECATED_MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = `${STORAGE_APP_NA
 /** @deprecated Use {@link USER_OVERRIDES_LOCALSTORAGE_KEY} instead */
 export const DEPRECATED_USER_OVERRIDES_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME_DEPRECATED}.userOverrides`;

+/** Build version stored in localStorage for non-PWA update detection */
+export const BUILD_VERSION_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.buildVersion`;
+
 /** Maps new keys to their deprecated fallback keys */
 export const NEW_TO_DEPRECATED_MAP: Record<string, string> = {
 	[ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY]: DEPRECATED_ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY,
@@ -5,7 +5,6 @@ import { ROUTES } from './routes';

 export const FORK_TREE_DEPTH_PADDING = 8;
 export const SYSTEM_MESSAGE_PLACEHOLDER = 'System message';
-export const APP_NAME = import.meta.env.VITE_PUBLIC_APP_NAME || 'llama-ui';

 export const ICON_STRIP_TRANSITION_DURATION = 150;
 export const ICON_STRIP_TRANSITION_DELAY_MULTIPLIER = 50;
@@ -63,3 +63,5 @@ export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol }
 export { KeyboardKey } from './keyboard.enums';

 export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums';
+
+export { SplashOrientation } from './splash.enums';
@@ -0,0 +1,7 @@
+/**
+ * Splash screen orientation for iOS apple-touch-startup-image.
+ *
+ * The string values are the lowercase orientation names.
+ */
+export enum SplashOrientation {
+	PORTRAIT = 'portrait',
+	LANDSCAPE = 'landscape'
+}
@@ -0,0 +1,80 @@
+import { browser } from '$app/environment';
+import { useRegisterSW } from 'virtual:pwa-register/svelte';
+import { versionStore } from '$lib/stores/version.svelte';
+import { BUILD_VERSION_LOCALSTORAGE_KEY } from '$lib/constants/storage';
+import { SW_CONFIG } from '$lib/constants/pwa';
+
+/**
+ * Hook for PWA service worker registration, update polling, and build version mismatch detection.
+ *
+ * Combines two concerns that always belong together:
+ * 1. SW registration with periodic polling for updates
+ * 2. localStorage-based version tracking for non-PWA users
+ *
+ * @returns an object exposing `needRefresh` (the store from `useRegisterSW`),
+ *   `updateServiceWorker`, and the reactive boolean `needRefreshByStorage`.
+ *   Both flags are getters; read them through the returned object, not via destructuring.
+ */
+export function usePwa() {
+	// Handle of the polling timer; it is replaced on re-registration but never cleared on teardown.
+	let swCheckInterval: ReturnType<typeof setInterval> | null = null;
+	let needRefreshByStorage = $state(false);
+
+	const {
+		// offlineReady, // to do - add installation banners for iOS
+		needRefresh: pwaNeedRefresh,
+		updateServiceWorker
+	} = useRegisterSW({
+		onRegisteredSW(swUrl: string, r: ServiceWorkerRegistration | undefined) {
+			// Drop a timer left over from an earlier registration so only one poll loop runs.
+			if (swCheckInterval) {
+				clearInterval(swCheckInterval);
+			}
+			swCheckInterval = setInterval(async () => {
+				// Skip this tick when there is no registration, an install is in progress, or we are offline.
+				if (!r || r.installing || !navigator?.onLine) return;
+
+				try {
+					// Re-fetch the service worker script, bypassing the HTTP cache.
+					const resp = await fetch(swUrl, {
+						cache: SW_CONFIG.UPDATE_FETCH_OPTIONS.CACHE,
+						headers: {
+							cache: SW_CONFIG.UPDATE_FETCH_OPTIONS.HEADERS.CACHE,
+							'cache-control': SW_CONFIG.UPDATE_FETCH_OPTIONS.HEADERS.CACHE_CONTROL
+						}
+					});
+					// Only ask the registration to update when the script is reachable.
+					if (resp?.status === 200) {
+						await r.update();
+					}
+				} catch (e) {
+					// Best effort: a failed check is logged and retried on the next tick.
+					console.error(e);
+				}
+			}, SW_CONFIG.CHECK_INTERVAL_MS);
+		},
+		onRegisterError(error: unknown) {
+			console.error('[PWA] SW registration error:', error);
+		}
+	});
+
+	// Detect version mismatch via localStorage.
+	// _app/version.json is SvelteKit's native version file for PWA cache invalidation.
+	// This comparison detects server upgrades for non-PWA users.
+	$effect(() => {
+		if (!browser) return;
+
+		// Empty until the version store has loaded; the effect re-runs once it has a value.
+		const currentVersion = versionStore.value;
+		if (!currentVersion) return;
+
+		try {
+			const storedVersion = localStorage.getItem(BUILD_VERSION_LOCALSTORAGE_KEY);
+			// A first visit (nothing stored) is not a mismatch.
+			needRefreshByStorage = !!storedVersion && storedVersion !== currentVersion;
+			// Remember the current version, so the next page load compares against it.
+			localStorage.setItem(BUILD_VERSION_LOCALSTORAGE_KEY, currentVersion);
+		} catch {
+			// localStorage access threw: treat as "no mismatch".
+			needRefreshByStorage = false;
+		}
+	});
+
+	return {
+		/** Writable that is true when a PWA service worker update is available */
+		get needRefresh() {
+			return pwaNeedRefresh;
+		},
+		updateServiceWorker,
+		/** Version mismatch detected via localStorage (non-PWA users) */
+		get needRefreshByStorage() {
+			return needRefreshByStorage;
+		}
+	};
+}
@@ -0,0 +1,42 @@
+/**
+ * buildInfoStore - llama.cpp build information
+ *
+ * Reads the build version from `build.json` — embedded at llama.cpp build time
+ * with the llama.cpp build number (LLAMA_BUILD_NUMBER). Shown in the UI when
+ * `showBuildVersion` is enabled.
+ *
+ * In dev mode (`import.meta.env.DEV`, e.g. via `npm run dev`), `build.json` is
+ * not fetched and the version is set to the literal string `'dev'`.
+ */
+
+import { browser } from '$app/environment';
+import { base } from '$app/paths';
+
+// Reactive build version; stays '' until a version has been loaded.
+let build = $state<string>('');
+
+/**
+ * Loads the build version once.
+ *
+ * Does nothing outside the browser. In dev mode the value is the literal
+ * 'dev'. Otherwise `build.json` is fetched (bypassing the HTTP cache) and its
+ * `version` field is used; a missing field yields ''. Any fetch or parse
+ * failure leaves the current value untouched.
+ */
+async function loadBuild() {
+	if (!browser) {
+		return;
+	}
+
+	if (import.meta.env.DEV) {
+		build = 'dev';
+		return;
+	}
+
+	try {
+		const response = await fetch(`${base}/build.json`, { cache: 'no-store' });
+		if (!response.ok) {
+			return;
+		}
+
+		const payload = await response.json();
+		build = payload.version ?? '';
+	} catch {
+		// build.json missing or unreachable - leave as empty string
+	}
+}
+
+// Kick off the load at module import time; the promise is intentionally not awaited.
+loadBuild();
+
+/** Read-only accessor for the reactive build version. */
+export const buildInfoStore = {
+	get value(): string {
+		return build;
+	}
+};
@@ -489,7 +489,7 @@ class MCPStore {
 			if (!rootDomain) return null;

 			const origin = `${url.protocol}//${rootDomain}`;
-			const candidates = ['favicon.ico', 'favicon.svg', 'favicon.png'];
+			const candidates = ['favicon.ico', 'favicon.png'];

 			for (const path of candidates) {
 				const faviconUrl = `${origin}/${path}`;
@@ -0,0 +1,14 @@
+import { browser } from '$app/environment';
+import { MEDIA_QUERIES } from '$lib/constants';
+
+// Media query list for the system dark-mode preference; null outside the browser.
+const systemDarkQuery = browser ? window.matchMedia(MEDIA_QUERIES.PREFERS_DARK) : null;
+
+/** Reactive theme state: `isSystemDark` mirrors the system color-scheme preference. */
+export const theme = $state({
+	isSystemDark: systemDarkQuery !== null && systemDarkQuery.matches
+});
+
+// Keep the state in sync when the system preference changes.
+systemDarkQuery?.addEventListener('change', (event) => {
+	theme.isSystemDark = event.matches;
+});
@@ -0,0 +1,41 @@
+/**
+ * versionStore - Frontend build version
+ *
+ * Reads from SvelteKit's `_app/version.json` — generated by the @vite-pwa/sveltekit
+ * plugin. The version string changes on every build, so comparing it against
+ * localStorage reliably detects server upgrades.
+ *
+ * In dev mode, falls back to `'dev'`.
+ */
+
+import { browser } from '$app/environment';
+import { base } from '$app/paths';
+
+// Reactive frontend version; stays '' until a version has been loaded.
+let version = $state<string>('');
+
+/**
+ * Loads the frontend version once.
+ *
+ * Does nothing outside the browser. In dev mode the value is the literal
+ * 'dev'. Otherwise `_app/version.json` is fetched (bypassing the HTTP cache)
+ * and its `version` field is used; a missing field yields ''. Any fetch or
+ * parse failure leaves the current value untouched.
+ */
+async function loadVersion() {
+	if (!browser) {
+		return;
+	}
+
+	if (import.meta.env.DEV) {
+		version = 'dev';
+		return;
+	}
+
+	try {
+		const response = await fetch(`${base}/_app/version.json`, { cache: 'no-store' });
+		if (!response.ok) {
+			return;
+		}
+
+		const payload = await response.json();
+		version = payload.version ?? '';
+	} catch {
+		// _app/version.json missing or unreachable - leave as empty string
+	}
+}
+
+// Kick off the load at module import time; the promise is intentionally not awaited.
+loadVersion();
+
+/** Read-only accessor for the reactive frontend version. */
+export const versionStore = {
+	get value(): string {
+		return version;
+	}
+};
@@ -165,3 +165,6 @@ export type { ToolEntry, ToolGroup } from './tools';

 // Reasoning
 export type { ReasoningEffortLevel } from './reasoning';
+
+// Splash
+export type { SplashDimensions } from './splash';
@@ -0,0 +1 @@
+/**
+ * Size description of a splash-screen target: device width, device height and
+ * device pixel ratio. NOTE(review): width/height are presumably logical points — confirm with the caller.
+ */
+export type SplashDimensions = { deviceW: number; deviceH: number; dpr: number };
@@ -57,7 +57,7 @@ export async function convertPDFToText(file: File): Promise<string> {

 	try {
 		const buffer = await getFileAsBuffer(file);
-		const pdf = await pdfjs.getDocument(buffer).promise;
+		const pdf = await pdfjs.getDocument({ data: buffer }).promise;
 		const numPages = pdf.numPages;

 		const textContentPromises: Promise<TextContent>[] = [];
@@ -94,7 +94,7 @@ export async function convertPDFToImage(file: File, scale: number = 1.5): Promis

 	try {
 		const buffer = await getFileAsBuffer(file);
-		const doc = await pdfjs.getDocument(buffer).promise;
+		const doc = await pdfjs.getDocument({ data: buffer }).promise;
 		const pages: Promise<string>[] = [];

 		for (let i = 1; i <= doc.numPages; i++) {
@@ -13,6 +13,8 @@
 		DialogConversationTitleUpdate,
 		SidebarNavigation
 	} from '$lib/components/app';
+	import { PwaMetaTags, PwaRefreshAlert } from '$lib/components/pwa';
+	import { pwaAssetsHead } from 'virtual:pwa-assets/head';

 	import { conversationsStore } from '$lib/stores/conversations.svelte';
 	import * as Sidebar from '$lib/components/ui/sidebar/index.js';
@@ -26,10 +28,16 @@
 	import { modelsStore } from '$lib/stores/models.svelte';
 	import { mcpStore } from '$lib/stores/mcp.svelte';
 	import { TOOLTIP_DELAY_DURATION } from '$lib/constants';
+	import { FAVICON_PATHS, FAVICON_SELECTORS } from '$lib/constants/pwa';
 	import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
+	import { usePwa } from '$lib/hooks/use-pwa.svelte';
 	import { useSettingsNavigation } from '$lib/hooks/use-settings-navigation.svelte';
 	import { conversations } from '$lib/stores/conversations.svelte';
 	import { isMobile } from '$lib/stores/viewport.svelte';
+	import { theme } from '$lib/stores/theme.svelte';
+	import { buildInfoStore } from '$lib/stores/build-info.svelte';
+
+	import { SETTINGS_KEYS } from '$lib/constants';

 	let { children } = $props();
 	let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
@@ -46,11 +54,31 @@
 		  }
 		| undefined = $state();

+	let showBuildVersion = $derived(config()[SETTINGS_KEYS.SHOW_BUILD_VERSION] as boolean);
+
 	let titleUpdateDialogOpen = $state(false);
 	let titleUpdateCurrentTitle = $state('');
 	let titleUpdateNewTitle = $state('');
 	let titleUpdateResolve: ((value: boolean) => void) | null = null;
+
 	const panelNav = useSettingsNavigation();
+	// Keep the hook object intact: destructuring needRefreshByStorage reads the getter once and freezes it
+	const pwa = usePwa();
+	const { needRefresh, updateServiceWorker } = pwa;
+
+	// Points the ICO and SVG favicon <link> elements at the light or dark asset,
+	// depending on the system color scheme held by the theme store.
+	function updateFavicon() {
+		const useDark = theme.isSystemDark;
+
+		// [selector, href] pairs, processed in order: ICO first, then SVG.
+		const targets = [
+			[FAVICON_SELECTORS.ICO_48X48, useDark ? FAVICON_PATHS.ICO_DARK : FAVICON_PATHS.ICO_LIGHT],
+			[FAVICON_SELECTORS.SVG_ANY, useDark ? FAVICON_PATHS.SVG_DARK : FAVICON_PATHS.SVG_LIGHT]
+		] as const;
+
+		for (const [selector, href] of targets) {
+			const link = document.querySelector(selector) as HTMLLinkElement | null;
+			// A <link> that is not in the document is skipped.
+			if (link) {
+				link.href = href;
+			}
+		}
+	}

 	function navigateToConversation(direction: -1 | 1) {
 		const allConvs = conversations();
@@ -137,9 +165,16 @@
 	}

 	onMount(() => {
+		updateFavicon();
 		mounted = true;
 	});

+	$effect(() => {
+		void theme.isSystemDark;
+
+		updateFavicon();
+	});
+
 	$effect(() => {
 		if (alwaysShowSidebarOnDesktop && isDesktop) {
 			sidebarOpen = true;
@@ -236,13 +271,36 @@
 </script>

 <svelte:head>
+	{#if pwaAssetsHead.themeColor}
+		<meta name="theme-color" content={pwaAssetsHead.themeColor.content} />
+	{/if}
+
 	{#if config().customCss}
 		<style use:customCss></style>
 	{/if}
+
+	{#each pwaAssetsHead.links as link (link.href)}
+		<link {...link} />
+	{/each}
+
+	<PwaMetaTags />
 </svelte:head>

+<!-- PWA update prompt + version -->
+<div class="fixed right-4 bottom-4 z-[9999] flex flex-col items-end gap-1">
+	{#if showBuildVersion && buildInfoStore.value}
+		<span class="text-[10px] tabular-nums text-muted-foreground">{buildInfoStore.value}</span>
+	{/if}
+	<PwaRefreshAlert
+		needRefresh={$needRefresh || pwa.needRefreshByStorage}
+		forceReload={pwa.needRefreshByStorage}
+		{updateServiceWorker}
+	/>
+</div>
+
 <Tooltip.Provider delayDuration={TOOLTIP_DELAY_DURATION}>
 	<ModeWatcher />
+
 	<Toaster richColors />

 	<DialogConversationTitleUpdate
@@ -254,7 +312,7 @@
 	/>

 	<Sidebar.Provider bind:open={sidebarOpen}>
-		<div class="flex h-dvh w-full">
+		<div class="flex h-screen w-full">
 			<Sidebar.Root variant="floating" class="h-full"
 				><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
 			>
@@ -285,9 +343,9 @@
 				/>
 			{/if}

-			<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden"
-				>{@render children?.()}</Sidebar.Inset
-			>
+			<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
+				{@render children?.()}
+			</Sidebar.Inset>
 		</div>
 	</Sidebar.Provider>
 </Tooltip.Provider>
@@ -0,0 +1,14 @@
+<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_29_291)">
+<path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="#FAFAFA"/>
+<path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="#FAFAFA"/>
+<path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="#FAFAFA"/>
+<path d="M304 488H112L202.462 331.2L304 488Z" fill="#FAFAFA"/>
+<path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="#FAFAFA"/>
+</g>
+<defs>
+<clipPath id="clip0_29_291">
+<rect width="512" height="512" fill="white"/>
+</clipPath>
+</defs>
+</svg>
@@ -1 +1,14 @@
-<svg width="256" xmlns="http://www.w3.org/2000/svg" height="256" id="screenshot-ef94fbb0-dbab-80ed-8006-89429900edbf" viewBox="0 0 256 256" xmlns:xlink="http://www.w3.org/1999/xlink" fill="none" version="1.1"><g id="shape-ef94fbb0-dbab-80ed-8006-89429900edbf" rx="0" ry="0"><g id="shape-ef94fbb0-dbab-80ed-8006-894215755c3a"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-894215755c3a"><rect rx="0" ry="0" x="0" y="0" transform="matrix(1.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000)" width="256" height="256" style="fill: rgb(27, 31, 32); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef3f" rx="0" ry="0"><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef40"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef40"><path d="M171.66500854492188,99.5302505493164L159.79953002929688,120.62468719482422C144.15451049804688,108.58329010009766,120.9504165649414,106.8254165649414,105.3053970336914,119.7457504272461C80.0798110961914,140.57652282714844,81.8376235961914,188.7422637939453,121.1261978149414,189.00587463378906C132.11300659179688,189.00587463378906,141.42965698242188,183.8201141357422,151.44967651367188,180.39234924316406L156.72335815429688,201.3988494873047C147.84591674804688,205.52989196777344,138.79293823242188,209.7487335205078,129.03683471679688,211.06712341308594C40.08835220336914,223.1964569091797,45.18600845336914,94.78400421142578,125.6088638305664,88.10407257080078C142.48434448242188,86.69782257080078,157.33834838867188,91.09247589111328,171.75314331054688,99.5302505493164Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef41"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef41"><path 
d="M110.2272720336914,79.31470489501953C96.6918716430664,83.35785675048828,84.1232681274414,90.8288345336914,74.6305923461914,101.28812408447266C72.8727798461914,80.01782989501953,77.6188735961914,37.03793716430664,101.2621841430664,28.6001033782959C104.7780532836914,27.36964988708496,116.8195571899414,24.293371200561523,116.4679946899414,30.533788681030273C116.1161880493164,36.77426528930664,107.7663345336914,47.49722671508789,105.7450942993164,53.29823684692383C102.2292251586914,63.49386978149414,105.4811782836914,70.52535247802734,110.3154067993164,79.40265655517578Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef42"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef42"><path d="M143.62692260742188,127.65621185302734L143.62692260742188,143.47706604003906L157.68991088867188,143.47706604003906L157.68991088867188,155.7821807861328L143.62692260742188,155.7821807861328L143.62692260742188,170.7240753173828L130.44284057617188,170.7240753173828L130.44284057617188,155.7821807861328L115.5009536743164,155.7821807861328L115.5009536743164,143.47706604003906L129.12448120117188,143.47706604003906L130.44284057617188,142.15867614746094L130.44284057617188,127.65621185302734L143.62692260742188,127.65621185302734Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef43"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef43"><path 
d="M191.96823120117188,127.65621185302734L191.96823120117188,142.15867614746094L193.28683471679688,143.47706604003906L206.91036987304688,143.47706604003906L206.91036987304688,155.7821807861328L191.96823120117188,155.7821807861328L191.96823120117188,170.7240753173828L178.78439331054688,170.7240753173828L178.78439331054688,155.7821807861328L164.72140502929688,155.7821807861328L164.72140502929688,143.47706604003906L178.78439331054688,143.47706604003906L178.78439331054688,127.65621185302734L191.96823120117188,127.65621185302734Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef44"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef44"><path d="M153.20748901367188,38.092655181884766C154.96554565429688,40.72946548461914,145.03341674804688,52.06770706176758,143.45114135742188,54.96817398071289C138.88082885742188,63.581790924072266,141.95700073242188,68.50382232666016,145.38473510742188,76.67792510986328C135.45285034179688,75.18372344970703,126.2240982055664,76.41425323486328,116.3798599243164,77.55683135986328C118.5773696899414,58.659732818603516,129.21261596679688,31.1490535736084,153.20748901367188,38.092655181884766Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g></g></g></svg>
+<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_29_291)">
+<path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="#111111"/>
+<path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="#111111"/>
+<path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="#111111"/>
+<path d="M304 488H112L202.462 331.2L304 488Z" fill="#111111"/>
+<path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="#111111"/>
+</g>
+<defs>
+<clipPath id="clip0_29_291">
+<rect width="512" height="512" fill="white"/>
+</clipPath>
+</defs>
+</svg>
@@ -29,9 +29,6 @@ const config = {
 		},
 		alias: {
 			$styles: 'src/styles'
-		},
-		version: {
-			name: 'llama-ui'
 		}
 	},

@@ -1,7 +0,0 @@
-import { expect, test } from '@playwright/test';
-
-test('home page loads correctly', async ({ page }) => {
-	await page.goto('/');
-	// Wait for the greeting to become visible (stores need time to initialize)
-	await expect(page.locator('h1', { hasText: /Hello there/ })).toBeVisible();
-});
@@ -0,0 +1,106 @@
+import { expect, test } from '@playwright/test';
+
+test.describe('PWA Service Worker', () => {
+	// Verifies that an active service worker is reachable from the page and that
+	// its script URL contains '/sw.js'.
+	test('service worker is registered', async ({ page }) => {
+		await page.goto('/');
+
+		// Runs inside the browser: `navigator.serviceWorker.ready` is raced against
+		// a 15000 ms timer that rejects with a descriptive error, so a worker that
+		// never becomes ready produces that message rather than an open-ended wait.
+		const swURL = await page.evaluate(async () => {
+			const registration = await Promise.race([
+				// eslint-disable-next-line @typescript-eslint/ban-ts-comment
+				// @ts-ignore - type inference differs from browser runtime
+				navigator.serviceWorker.ready,
+				new Promise((_, reject) =>
+					setTimeout(() => reject(new Error('Service worker registration failed: timeout')), 15000)
+				)
+			]);
+			// @ts-expect-error registration is of type unknown
+			return registration.active?.scriptURL;
+		});
+
+		// `scriptURL` is undefined when the registration has no active worker.
+		expect(swURL).toBeTruthy();
+		expect(swURL).toContain('/sw.js');
+	});
+
+	// Downloads the active service worker script and checks that it lists the
+	// expected precache entries and routing/caching identifiers.
+	test('service worker has precache configured', async ({ page }) => {
+		await page.goto('/');
+
+		// A single wait on `navigator.serviceWorker.ready` is enough: it both blocks
+		// until a worker is active and yields the registration we need the URL from.
+		const swActive = await page.evaluate(async () => {
+			const reg = await navigator.serviceWorker.ready;
+			return reg.active?.scriptURL ?? null;
+		});
+
+		expect(swActive).toBeTruthy();
+
+		const swResponse = await page.request.get(swActive!);
+		// Fail here on a non-2xx answer instead of pattern-matching an error page.
+		expect(swResponse.ok()).toBeTruthy();
+		const swContent = await swResponse.text();
+
+		// Precache contains content-hashed bundle paths. The hash class accepts `_`
+		// as well as `-`, matching the build-output unit tests, so a hash that
+		// happens to contain an underscore does not fail the test.
+		expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		expect(swContent).toMatch(/"manifest\.webmanifest"/);
+		expect(swContent).toMatch(/"_app\/version\.json"/);
+		expect(swContent).toMatch(/NavigationRoute/);
+		expect(swContent).toMatch(/api-cache/);
+	});
+
+	// Loads the app once online, switches the browser context offline, reloads,
+	// and expects the page body to still have text content.
+	test('offline mode - page loads when offline after caching', async ({ browser }) => {
+		// A dedicated context is created so offline mode can be toggled on it.
+		const context = await browser.newContext();
+
+		// try/finally guarantees the context is closed even when a navigation or
+		// assertion below throws; otherwise a failing run would leave it open.
+		try {
+			const offlinePage = await context.newPage();
+
+			await offlinePage.goto('/');
+			await offlinePage.waitForLoadState('networkidle');
+
+			await offlinePage.evaluate(async () => {
+				await navigator.serviceWorker.ready;
+			});
+
+			// NOTE(review): fixed 2000 ms pause, presumably to let the worker finish
+			// populating its caches - consider replacing with a deterministic signal.
+			await offlinePage.waitForTimeout(2000);
+
+			await context.setOffline(true);
+			await offlinePage.goto('/');
+
+			const bodyText = await offlinePage.locator('body').textContent();
+			expect(bodyText).toBeTruthy();
+		} finally {
+			await context.close();
+		}
+	});
+
+	// Requests /_app/version.json and checks it carries a non-empty string `version`.
+	test('version.json is accessible and contains version', async ({ page }) => {
+		const res = await page.request.get('/_app/version.json');
+		expect(res.ok()).toBeTruthy();
+
+		const payload = await res.json();
+		expect(payload).toHaveProperty('version');
+
+		const { version } = payload;
+		expect(typeof version).toBe('string');
+		expect(version.length).toBeGreaterThan(0);
+	});
+
+	// Requests /manifest.webmanifest and checks its identifying fields and icons.
+	test('manifest.webmanifest is accessible and valid', async ({ page }) => {
+		const res = await page.request.get('/manifest.webmanifest');
+		expect(res.ok()).toBeTruthy();
+
+		const manifest = await res.json();
+
+		// Field/value pairs the manifest must expose, checked in this order.
+		const requiredFields: Array<[string, string]> = [
+			['name', 'llama-ui'],
+			['short_name', 'llama-ui'],
+			['start_url', './'],
+			['display', 'standalone']
+		];
+		for (const [field, value] of requiredFields) {
+			expect(manifest).toHaveProperty(field, value);
+		}
+
+		const { icons } = manifest;
+		expect(icons).toBeTruthy();
+		expect(icons.length).toBeGreaterThan(0);
+	});
+
+	// Fetches the served index page and checks it references the content-hashed
+	// JS/CSS bundles under _app/immutable/ (relative `./` or absolute `/` prefix).
+	test('index.html contains content-hashed bundle references', async ({ page }) => {
+		const response = await page.request.get('/');
+		expect(response.ok()).toBeTruthy();
+
+		const html = await response.text();
+
+		// The hash class accepts `_` as well as `-`, matching the build-output unit
+		// tests, so a hash containing an underscore does not fail the test.
+		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/);
+	});
+});
@@ -0,0 +1,57 @@
+<script module lang="ts">
+	// Story metadata for the PwaRefreshAlert component: registers it under
+	// 'Components/PwaRefreshAlert' and renders each story with the centered layout.
+	import { defineMeta } from '@storybook/addon-svelte-csf';
+	import PwaRefreshAlert from '$lib/components/pwa/PwaRefreshAlert.svelte';
+	import { expect } from 'storybook/test';
+
+	// `Story` is the component used by the story definitions in this file's markup.
+	const { Story } = defineMeta({
+		title: 'Components/PwaRefreshAlert',
+		component: PwaRefreshAlert,
+		parameters: {
+			layout: 'centered'
+		}
+	});
+</script>
+
+<Story
+	name="Default"
+	args={{ needRefresh: true, updateServiceWorker: () => console.log('reload') }}
+	play={async ({ canvas }) => {
+		// With needRefresh set, the heading, the explanatory text and the
+		// Reload button must all be rendered.
+		const heading = canvas.getByText('Update available');
+		await expect(heading).toBeInTheDocument();
+
+		const message = canvas.getByText(/A new version is available/);
+		await expect(message).toBeInTheDocument();
+
+		const reloadButton = canvas.getByRole('button', { name: 'Reload' });
+		await expect(reloadButton).toBeInTheDocument();
+	}}
+/>
+
+<Story
+	name="Hidden"
+	args={{ needRefresh: false, updateServiceWorker: () => console.log('reload') }}
+	play={async ({ canvas }) => {
+		// Without needRefresh the alert heading must be absent; queryByText
+		// returns null instead of throwing when nothing matches.
+		const heading = canvas.queryByText('Update available');
+		await expect(heading).not.toBeInTheDocument();
+	}}
+/>
+
+<Story
+	name="ClickReload"
+	args={{
+		needRefresh: true,
+		updateServiceWorker: () => console.log('reload')
+	}}
+	play={async ({ canvas, userEvent }) => {
+		// The Reload button is present before the click...
+		const reloadButton = canvas.getByRole('button', { name: 'Reload' });
+		await expect(reloadButton).toBeInTheDocument();
+
+		await userEvent.click(reloadButton);
+
+		// ...and afterwards neither the heading nor the button may remain.
+		const headingAfterClick = canvas.queryByText('Update available');
+		await expect(headingAfterClick).not.toBeInTheDocument();
+
+		const buttonAfterClick = canvas.queryByRole('button', { name: 'Reload' });
+		await expect(buttonAfterClick).not.toBeInTheDocument();
+	}}
+/>
@@ -0,0 +1,195 @@
+import { existsSync, readFileSync, readdirSync } from 'node:fs';
+import { resolve } from 'node:path';
+import { describe, expect, it } from 'vitest';
+
+const DIST_DIR = resolve(__dirname, '../../dist');
+const distExists = existsSync(DIST_DIR);
+
+// PWA Build Output tests are integration tests that require a built dist/.
+// CI builds first then runs these tests; local devs should run `npm run build` or use `npm run test:pwa`.
+describe('PWA Build Output', () => {
+	if (!distExists) {
+		console.warn(`⚠ Skipping PWA Build Output tests - dist/ not found (run 'npm run build' first)`);
+		// Registered with it.skip so the run reports a skipped test instead of a
+		// passing no-op, which would hide the fact that nothing was verified.
+		it.skip('skipped - dist/ not found', () => {});
+		return;
+	}
+
+	// Reads a file from dist/ as UTF-8, or returns '' when it is missing. Reading
+	// eagerly with readFileSync would throw while the suite is being collected,
+	// so an incomplete dist/ would abort the whole file; with '' the dedicated
+	// "exists" and `toBeTruthy()` assertions below fail individually instead.
+	const readDistFile = (name: string): string => {
+		const file = resolve(DIST_DIR, name);
+		return existsSync(file) ? readFileSync(file, 'utf-8') : '';
+	};
+
+	const swContent = readDistFile('sw.js');
+	const indexContent = readDistFile('index.html');
+
+	// Presence checks for the files the PWA build is expected to emit into dist/.
+	describe('Core files exist', () => {
+		it('service worker (sw.js) exists', () => {
+			const swPath = resolve(DIST_DIR, 'sw.js');
+			expect(existsSync(swPath), 'sw.js not found').toBeTruthy();
+		});
+
+		it('workbox library exists (hashed filename)', () => {
+			// The workbox runtime is expected as workbox-<hash>.js in the dist root.
+			const workboxPattern = /^workbox-[^.]+\.js$/;
+			const matches = readdirSync(DIST_DIR).filter((entry) => workboxPattern.test(entry));
+			expect(matches.length).toBeGreaterThan(0);
+		});
+
+		it('manifest.webmanifest exists', () => {
+			const manifestPath = resolve(DIST_DIR, 'manifest.webmanifest');
+			expect(existsSync(manifestPath), 'manifest.webmanifest not found').toBeTruthy();
+		});
+
+		it('SvelteKit bundle.js exists in _app/immutable/', () => {
+			// Any file named bundle.<something>.js in _app/immutable/ satisfies the check.
+			const immutableDir = resolve(DIST_DIR, '_app', 'immutable');
+			expect(existsSync(immutableDir), '_app/immutable/ not found').toBeTruthy();
+
+			const isJsBundle = (entry: string) => entry.startsWith('bundle.') && entry.endsWith('.js');
+			const bundles = readdirSync(immutableDir).filter(isJsBundle);
+			expect(bundles.length).toBeGreaterThan(0);
+		});
+
+		it('SvelteKit bundle.css exists in _app/immutable/assets/', () => {
+			// Any file named bundle.<something>.css in _app/immutable/assets/ satisfies the check.
+			const assetsDir = resolve(DIST_DIR, '_app', 'immutable', 'assets');
+			expect(existsSync(assetsDir), '_app/immutable/assets/ not found').toBeTruthy();
+
+			const isCssBundle = (entry: string) => entry.startsWith('bundle.') && entry.endsWith('.css');
+			const bundles = readdirSync(assetsDir).filter(isCssBundle);
+			expect(bundles.length).toBeGreaterThan(0);
+		});
+
+		it('version.json exists in _app/', () => {
+			const versionPath = resolve(DIST_DIR, '_app', 'version.json');
+			expect(existsSync(versionPath), '_app/version.json not found').toBeTruthy();
+		});
+	});
+
+	// Parses dist/_app/version.json and checks it carries a non-empty string `version`.
+	describe('version.json content', () => {
+		it('has valid JSON with version field', () => {
+			const versionPath = resolve(DIST_DIR, '_app', 'version.json');
+			const data = JSON.parse(readFileSync(versionPath, 'utf-8'));
+
+			expect(data).toHaveProperty('version');
+
+			const { version } = data;
+			expect(typeof version).toBe('string');
+			expect(version.length).toBeGreaterThan(0);
+		});
+	});
+
+	// Pattern checks against the generated dist/sw.js source. Every test first
+	// asserts the source is non-empty so an unreadable worker fails explicitly.
+	describe('Service worker content', () => {
+		it('service worker has minified self.define format', () => {
+			expect(swContent).toBeTruthy();
+			// The dot is escaped so only the literal `self.define` matches; left
+			// unescaped it would accept any character in that position.
+			expect(swContent).toMatch(/if\(!self\.define\)/);
+		});
+
+		it('references hashed workbox file (SvelteKit build output)', () => {
+			expect(swContent).toBeTruthy();
+			// The worker is expected to load the hashed workbox runtime via define([...]).
+			expect(swContent).toMatch(/define\(\["\.\/workbox-[a-zA-Z0-9]+"\]/);
+		});
+
+		it('precache contains SvelteKit bundle.js with content hash', () => {
+			expect(swContent).toBeTruthy();
+			// Content-hashed JS bundle under _app/immutable/.
+			expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		});
+
+		it('precache contains SvelteKit bundle.css with content hash', () => {
+			expect(swContent).toBeTruthy();
+			// Content-hashed CSS bundle under _app/immutable/assets/.
+			expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		});
+
+		it('precache contains _app/version.json', () => {
+			expect(swContent).toBeTruthy();
+			expect(swContent).toMatch(/"_app\/version\.json"/);
+		});
+
+		it('precache contains manifest.webmanifest', () => {
+			expect(swContent).toBeTruthy();
+			expect(swContent).toMatch(/"manifest\.webmanifest"/);
+		});
+
+		it('has navigation route registered', () => {
+			expect(swContent).toBeTruthy();
+			expect(swContent).toMatch(/NavigationRoute/);
+		});
+
+		it('has runtime caching for API routes', () => {
+			expect(swContent).toBeTruthy();
+			// Both the cache name and the strategy identifier must appear in the source.
+			expect(swContent).toMatch(/api-cache/);
+			expect(swContent).toMatch(/NetworkFirst/);
+		});
+	});
+
+	// Pattern checks against the generated dist/index.html source.
+	describe('index.html content', () => {
+		// Shared assertion: the page source must be non-empty and match `pattern`.
+		const expectIndexToMatch = (pattern: RegExp) => {
+			expect(indexContent).toBeTruthy();
+			expect(indexContent).toMatch(pattern);
+		};
+
+		it('has modulepreload link for SvelteKit bundle with content hash', () => {
+			// href may be relative (./) or absolute (/).
+			expectIndexToMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		});
+
+		it('has stylesheet link for SvelteKit bundle.css with content hash', () => {
+			expectIndexToMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		});
+
+		it('has dynamic import for SvelteKit bundle with content hash', () => {
+			expectIndexToMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/);
+		});
+
+		it('has __sveltekit__ variable (SvelteKit adds hash suffix)', () => {
+			// Matches the `__sveltekit_` prefix followed by a non-empty suffix.
+			expectIndexToMatch(/__sveltekit_[a-zA-Z0-9-]+/);
+		});
+
+		it('has PWA manifest link', () => {
+			expectIndexToMatch(/rel="manifest" href="(\.?\/)?manifest\.webmanifest"/);
+		});
+
+		it('has apple-touch-icon link', () => {
+			expectIndexToMatch(/rel="apple-touch-icon"/);
+		});
+
+		it('has _app paths for SvelteKit bundles', () => {
+			expectIndexToMatch(/_app\//);
+		});
+	});
+
+	// The build output must contain an _app directory.
+	describe('SvelteKit _app directory', () => {
+		it('_app directory exists (SvelteKit uses it for hashed assets)', () => {
+			const appDir = resolve(DIST_DIR, '_app');
+			expect(existsSync(appDir)).toBeTruthy();
+		});
+	});
+
+	// At least one workbox-<hash>.js file must sit directly in the dist root.
+	describe('Hashed workbox files', () => {
+		it('workbox-*.js files exist in dist root (SvelteKit build output)', () => {
+			const workboxPattern = /^workbox-[^.]+\.js$/;
+			const matches = readdirSync(DIST_DIR).filter((entry) => workboxPattern.test(entry));
+			expect(matches.length).toBeGreaterThan(0);
+		});
+	});
+
+	// Static files that must be present in the dist root.
+	describe('Static assets', () => {
+		// True when `name` exists directly under dist/.
+		const inDist = (name: string) => existsSync(resolve(DIST_DIR, name));
+
+		it('has favicon.ico', () => {
+			expect(inDist('favicon.ico')).toBeTruthy();
+		});
+
+		it('has PWA icons', () => {
+			// Checked in ascending size order; the first missing icon fails the test.
+			for (const icon of ['pwa-64x64.png', 'pwa-192x192.png', 'pwa-512x512.png']) {
+				expect(inDist(icon)).toBeTruthy();
+			}
+		});
+
+		it('has loading.html fallback page', () => {
+			expect(inDist('loading.html')).toBeTruthy();
+		});
+	});
+});
@@ -1,13 +1,16 @@
 import tailwindcss from '@tailwindcss/vite';
 import { sveltekit } from '@sveltejs/kit/vite';
+import { SvelteKitPWA } from '@vite-pwa/sveltekit';
 import { dirname, resolve } from 'path';
 import { fileURLToPath } from 'url';

 import { defineConfig, searchForWorkspaceRoot } from 'vite';
-import devtoolsJson from 'vite-plugin-devtools-json';
 import { storybookTest } from '@storybook/addon-vitest/vitest-plugin';
-import { llamaCppBuildPlugin } from './scripts/vite-plugin-llama-cpp-build';
+import { splashScreenPlugin } from './scripts/vite-plugin-splash-screen';
+import { buildInfoPlugin } from './scripts/vite-plugin-build-info';
+import { relativizeBasePlugin } from './scripts/vite-plugin-relativize-base';
 import { playwright } from '@vitest/browser-playwright';
+import { SVELTEKIT_PWA_OPTIONS } from './src/lib/constants/pwa';

 const __dirname = dirname(fileURLToPath(import.meta.url));

@@ -37,7 +40,14 @@ export default defineConfig({
 		minify: true
 	},

-	plugins: [tailwindcss(), sveltekit(), devtoolsJson(), llamaCppBuildPlugin()],
+	plugins: [
+		tailwindcss(),
+		sveltekit(),
+		SvelteKitPWA(SVELTEKIT_PWA_OPTIONS),
+		splashScreenPlugin(),
+		buildInfoPlugin(),
+		relativizeBasePlugin()
+	],

 	test: {
 		projects: [
Author	SHA1	Message	Date
Xuan-Son Nguyen	e37abd6b5f	mtmd: add batching API (#24384 ) * mtmd: add batching API * wip * first working version (gemma4v) * add arg * nits * wire up support_batch() * fix 0.0 output embd * fix audio * nits * refactor a bit * nits * fix non-batching case * fix comment	2026-06-13 00:10:29 +02:00
Sigbjørn Skjæret	f58bad4137	ci : unbreak release harder (#24545 ) * unbreak release harder * missed one * remove missing test for now	2026-06-12 23:49:36 +02:00
Sigbjørn Skjæret	cd5044661c	ci : unbreak release (#24544 )	2026-06-12 23:29:49 +03:00
Georgi Gerganov	ebc10770ac	server : fix reasoning budget WebUI precedence over model.ini (#24517 ) When reasoning-budget is set in model.ini, the per-request thinking_budget_tokens from the WebUI was ignored because the model.ini value took unconditional precedence. Swap the precedence so the WebUI per-request value is checked first, with the model.ini value serving as a fallback default. Assisted-by: pi:llama.cpp/Qwen3.6-27B	2026-06-12 17:59:56 +03:00
Ruben Ortlam	3e7bd4f39a	vulkan: add pipeline barriers for memcpy read operations (#23770 ) * vulkan: add pipeline barriers for memcpy read/write operations * remove unnecessary host write pipeline barriers	2026-06-12 16:43:50 +02:00
Aleksander Grygier	f7ca93d12c	ui: PWA support (#23871 ) * feat: Add basic PWA support and service worker for offline caching * feat: Vite PWA implementation WIP * feat: Improve PWA icons generation * feat: Add PWA workbox to server routes * feat: Include `version.json` in static assets * feat: Add HTTP cache headers for PWA static assets * feat: Update app name for `apple-mobile-web-app-title` * feat: Implement PWA versioning and automatic update detection * chore: Update `.gitignore` files * feat: Splash Screens * feat: Add dark mode favicon support * refactor: Cleanup * fix: Use dark logo for dark splash screens * refactor: Simplify favicons SVG code * fix: Adjust caching and polling for reliable service worker updates * fix: Add missing favicon entry * fix: Align PWA service worker configuration with SvelteKit build structure * fix: Replace hashed bundle paths with versioned static paths * test: Add PWA tests * ci: Add build output for unit tests * refactor: Cleanup * fix: Server build & release versioning * chore: Update package-lock.json * chore: Increase PWA cache size * chore: Update packages * feat: Update favicons * refactor: Post-merge fix * feat: support explicit build version for PWA cache busting * fix: CI * feat: Improve PWA Refresh Alert UI * feat: Add toggleable build version display * refactor: Cleanup * feat: Add version mismatch detection and manual app reload * refactor: replace dynamic imports with static * refactor: Cleanup * feat: Add safe space for `pwa-<size>.png` rendered icons * fix: use relative paths for PWA assets to support base path deployment * feat: add PWA mode detection via URL query parameter * feat: Use ?cache=true for SW-cached PWA assets * refactor: Build process cleanup * refactor: Decouple PWA versioning and remove ?cache=true workaround * chore: Update README logo * feat: Include PWA Assets generation in build script * refactor: `usePwa` hook for core layout * fix: Relativize base vite plugin * fix: remove unnecessary 
backslash escapes in test regexes * test: update static asset paths for API Key test * refactor: Move SvelteKit PWA Options config to constants * ui: fix update notification never appearing Keep the PWA hook object intact instead of destructuring needRefreshByStorage, which freezes the reactive getter. Also exclude loading.html from PWA precache to prevent 404 errors and broken SW installation.	2026-06-12 15:53:26 +02:00
Georgi Gerganov	02182fc5b9	fit : avoid including llama-ext.h in fit.h (#24506 )	2026-06-12 15:57:05 +03:00
				`@@ -0,0 +1 @@`
				`export const APP_NAME = import.meta.env?.VITE_PUBLIC_APP_NAME \|\| 'llama-ui';`
				`@@ -0,0 +1 @@`
				`export type SplashDimensions = { deviceW: number; deviceH: number; dpr: number };`