vulkan: add pipeline barriers for memcpy read operations (#23770 )

* vulkan: add pipeline barriers for memcpy read/write operations * remove unnecessary host write pipeline barriers
ui: PWA support (#23871 )
2026-06-12 16:56:43 +02:00 · 2026-06-12 16:43:50 +02:00 · 2026-06-12 15:53:26 +02:00 · 2026-06-12 15:57:05 +03:00 · 2026-06-12 15:55:35 +03:00 · 2026-06-12 15:55:35 +03:00
94 changed files with 10735 additions and 1628 deletions
@@ -59,8 +59,31 @@ jobs:
            echo "should_release=false" >> $GITHUB_OUTPUT
          fi

+  get-version:
+    runs-on: ubuntu-slim
+    outputs:
+      ui_version: ${{ steps.version.outputs.ui_version }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - id: version
+        run: |
+          # Resolve the UI version that is passed to every build job:
+          #   1. "b<BUILD_NUMBER>" when cmake/build-info.cmake sets a positive literal BUILD_NUMBER
+          #   2. otherwise "<short git hash>-<unix epoch>"
+          version=""
+          if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then
+            # Keep only the first number so the integer test below always sees a single
+            # value. `|| true` stops a failed match from aborting the step (the runner
+            # shell exits on error), so the git-hash fallback below can still run.
+            build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+' | head -n 1 || true)
+            if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
+              version="b${build_number}"
+            fi
+          fi
+          if [ -z "$version" ]; then
+            version=$(git rev-parse --short HEAD)-$(date +%s)
+          fi
+          echo "ui_version=${version}" >> "$GITHUB_OUTPUT"
+
  macos-cpu:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -116,6 +139,7 @@ jobs:
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -141,7 +165,7 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -201,6 +225,7 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -227,7 +252,7 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    strategy:
@@ -287,6 +312,7 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -312,7 +338,7 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-latest
@@ -379,6 +405,7 @@ jobs:
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -404,7 +431,7 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-24.04
@@ -476,7 +503,8 @@ jobs:
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
+            -DGGML_OPENVINO=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
          cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: ccache-clear
@@ -952,7 +980,7 @@ jobs:
          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-22.04
@@ -1044,6 +1072,7 @@ jobs:
            -DGGML_HIP=ON \
            -DHIP_PLATFORM=amd \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -1072,7 +1101,7 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: windows-2022
@@ -1168,6 +1197,7 @@ jobs:
            -DGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
            -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
@@ -1195,7 +1225,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    runs-on: macos-26

@@ -1225,6 +1255,7 @@ jobs:
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

      - name: xcodebuild for swift package
@@ -1344,10 +1375,12 @@ jobs:
 #          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz

-  ui:
-    needs: [check-release]
+  ui-build:
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    uses: ./.github/workflows/ui-build.yml
+    with:
+      hf_ui_version: ${{ needs.get-version.outputs.ui_version }}

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -1360,6 +1393,7 @@ jobs:
    runs-on: ubuntu-slim

    needs:
+      - get-version
      - windows
      - windows-cpu
      - windows-cuda
@@ -2,6 +2,11 @@ name: UI Build

 on:
  workflow_call:
+    inputs:
+      hf_ui_version:
+        description: 'Version string for version.json (e.g. 12345)'
+        required: false
+        type: string

 jobs:
  build:
@@ -25,9 +30,20 @@ jobs:
        working-directory: tools/ui

      - name: Build application
+        env:
+          HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
+          LLAMA_UI_VERSION: ${{ inputs.hf_ui_version || 'b0000' }}
        run: npm run build
        working-directory: tools/ui

+      - name: Run PWA unit tests (versioned build output)
+        run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
+        working-directory: tools/ui
+
+      - name: Run build-utils unit tests (both paths)
+        run: npx vitest --project=unit --run tests/unit/build-utils.spec.ts
+        working-directory: tools/ui
+
      - name: Generate checksums
        run: |
          cd tools/ui/dist
@@ -1,8 +1,8 @@
 name: UI (self-hosted)

 # these are the same as ui.yml, but with self-hosted runners
-# the runners come with pre-installed Playwright browsers version: 1.56.1
-# the jobs are much lighter because they don't need to install node and playwright browsers
+# the jobs are lighter because they don't need to install Node.js or Playwright browsers
+# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/

 on:
  workflow_dispatch:
@@ -61,6 +61,12 @@ jobs:
        run: npm ci
        working-directory: tools/ui

+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
@@ -72,12 +78,12 @@ jobs:
        working-directory: tools/ui

      - name: Run Client tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:client
        working-directory: tools/ui

      - name: Run Unit tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui

@@ -97,22 +103,23 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/

      - name: Build Storybook
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build-storybook
        working-directory: tools/ui

      - name: Run UI tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

      - name: Run E2E tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -43,7 +43,7 @@ jobs:
  ui-checks:
    name: Checks
    needs: ui-build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -60,6 +60,12 @@ jobs:
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
@@ -87,7 +93,7 @@ jobs:
        run: npm run test:client
        working-directory: tools/ui

-      - name: Run Unit tests
+      - name: Run Unit tests (uses pre-built dist/ from ui-build)
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui
@@ -95,7 +101,7 @@ jobs:
  e2e-tests:
    name: E2E Tests
    needs: ui-build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
@@ -117,10 +123,11 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
+      - name: Download built UI artifacts (reuses ui-build)
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/

      - name: Install Playwright browsers
        id: playwright
@@ -138,7 +145,7 @@ jobs:
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

-      - name: Run E2E tests
+      - name: Run E2E tests (uses pre-built dist/ from ui-build)
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -92,13 +92,6 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

-# Server Web UI temporary files (+ legacy directory)
-
-/tools/server/webui/node_modules
-/tools/server/webui/dist
-/tools/ui/node_modules
-/tools/ui/dist
-
 # Python

 /.venv
@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
+![llama](https://raw.githubusercontent.com/ggml-org/llama.brand/refs/heads/master/cover/llama-cpp/cover-llama-cpp-dark.svg)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
@@ -1,9 +1,7 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-backend.h"
 #include "llama.h"
-#include "../src/llama-ext.h"

 #include <vector>

@@ -18,31 +16,35 @@ enum common_params_fit_status {
 //   - this function is NOT thread safe because it modifies the global llama logger state
 //   - only parameters that have the same value as in llama_default_model_params are modified
 //     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
+common_params_fit_status common_fit_params(
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams,
+                              float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+   llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+                             size_t * margins,               // margins of memory to leave per device in bytes
+                           uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
+                     ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

 // print estimated memory to stdout
 void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams);

-void common_memory_breakdown_print(const struct llama_context * ctx);
+void common_memory_breakdown_print(const llama_context * ctx);
+
+// TODO: convert this to common_device_memory_data that wraps llama_device_memory_data
+//       add API for accessing the internal `llama-ext.h` information
+struct llama_device_memory_data;

 // Load a model + context with no_alloc and return the per-device memory breakdown.
 std::vector<llama_device_memory_data> common_get_device_memory_data(
-                                  const char   * path_model,
-        const struct llama_model_params         * mparams,
-        const struct llama_context_params       * cparams,
-        std::vector<ggml_backend_dev_t>         & devs,
-                                      uint32_t  & hp_ngl,
-                                      uint32_t  & hp_n_ctx_train,
-                                      uint32_t  & hp_n_expert,
-                           enum ggml_log_level    log_level);
+                         const char * path_model,
+           const llama_model_params * mparams,
+         const llama_context_params * cparams,
+    std::vector<ggml_backend_dev_t> & devs,
+                           uint32_t & hp_ngl,
+                           uint32_t & hp_n_ctx_train,
+                           uint32_t & hp_n_expert,
+                     ggml_log_level   log_level);
@@ -375,31 +375,437 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
    }
 };

+
+// EAGLE3 speculative decoding state
+//
+// Input of draft decoder: (This is different compared to MTP)
+//   At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P.
+//     - t_{P+1} = token at sequence pos P+1 (the *next* token after P)
+//     - g_P     = encoder output = projection of target's extracted hidden states at P
+//
+// Deferred boundary (MTP doesn't have this issue):
+//   Within a single process() call with n_tokens, we can only write decoder KV for
+//   training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens}
+//   which lies *outside* this batch — it is the token target will sample next or the first token from next ubatch.
+//   So the last training pos of each process() call is *deferred* to whichever next call has
+//   the missing token in hand:
+//     - multi-ubatch prefill: the next process()'s first token completes the pair
+//                              (handled by the per-seq "cross-ubatch bridge")
+//     - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last"
+//                              (target's freshest sample) to complete the pair
+//
+// Per-seq carry-over state:
+//   pending_g_last    [n_embd_dec]  ┐  the deferred boundary's (g, pos). Set by
+//   pending_pos_last  llama_pos     ┘  process() at end of ubatch (= last row);
+//                                       rebased by accept() to first-non-accepted pos.
+//   verify_g          [N × n_embd_dec] snapshot of process()'s encoder output;
+//   verify_pos_first  llama_pos         consumed by accept() to recover the right
+//   verify_g_rows     int32_t           pending_g_last row for any n_accepted value.
+//
+// Performance is overall good but there is waste in verify cycle:
+//   process() runs encoder + decoder on the *full* verify batch including rows for
+//   rejected drafts. The KV at those positions is then dropped.
+//
+// TODO: Not sure if we need optimization for this waste?
+// If so we may need hybrid stash:
+//      in verify mode, have process() only stash features and let draft() seed run
+//      encoder+decoder on n_accepted+1 rows.
 struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
-    //common_params_speculative_eagle3 params;
+    common_params_speculative_draft params;
+    llama_batch batch;
+
+    std::vector<common_sampler_ptr> smpls;
+
+    int32_t n_embd_dec = 0;       // draft hidden size
+    int32_t n_embd_enc = 0;       // target_layer_ids_n * target_hidden_size
+    int32_t n_embd_tgt = 0;       // target model hidden size
+
+    const int32_t * target_layer_ids   = nullptr; // model_dft's extract layer indices
+    uint32_t        target_layer_ids_n = 0;
+
+    // [per-seq] deferred boundary state
+    std::vector<std::vector<float>> pending_g_last;
+    std::vector<llama_pos>          pending_pos_last;
+
+    // [per-seq] snapshot of the most recent process()'s encoder output
+    std::vector<std::vector<float>> verify_g;         // [n_seq][n_rows * n_embd_dec]
+    std::vector<llama_pos>          verify_pos_first; // [n_seq] — pos of verify_g[seq][0]
+    std::vector<int32_t>            verify_g_rows;    // [n_seq] — number of rows
+
+    // scratch buffer for concatenated target features [n_tokens, n_embd_enc]
+    std::vector<float> features_buf;
+    std::vector<float> g_embd_buf;

    common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
+        , params(params.draft)
    {
        LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+        GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set");
+
+        const llama_model * model_dft = llama_get_model(ctx_dft);
+        const llama_model * model_tgt = llama_get_model(ctx_tgt);
+
+        target_layer_ids   = llama_model_target_layer_ids  (model_dft);
+        target_layer_ids_n = llama_model_target_layer_ids_n(model_dft);
+        if (target_layer_ids_n != 3) {
+            throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
+                                     std::to_string(target_layer_ids_n) + ")");
+        }
+
+        n_embd_tgt = llama_model_n_embd(model_tgt);
+        n_embd_dec = llama_model_n_embd(model_dft);
+        n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt;
+
+        const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
+        batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
+        // llama_batch_init allocates only one of token/embd; eagle3 decoder needs both.
+        // TODO: fix, how to call without malloc
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b);
+
+        smpls.resize(n_seq);
+        for (auto & s : smpls) {
+            common_params_sampling sparams;
+            sparams.no_perf  = false;
+            sparams.top_k    = 10;
+            sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
+            s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
+        }
+
+        // turn on extraction of the target layers' input embeddings
+        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
+            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
+        }
+
+        // turn on extraction of the draft model's pre-norm hidden state
+        // (used both for the encoder output g_embd and the decoder pre-norm output).
+        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
+
+        pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
+        pending_pos_last.assign(n_seq, -1);
+
+        verify_g.assign(n_seq, std::vector<float>());
+        verify_pos_first.assign(n_seq, -1);
+        verify_g_rows.assign(n_seq, 0);
    }

-    void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
-        // noop
+    ~common_speculative_impl_draft_eagle3() override {
+        if (batch.token != nullptr) {
+            free(batch.token);
+            batch.token = nullptr;
+        }
+        llama_batch_free(batch);
    }

-    bool process(const llama_batch & /*batch*/) override {
-        // TODO: implement
+    void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
+        const int32_t N = (int32_t) prompt.size();
+        if (N <= 0) {
+            return;
+        }
+        // expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to
+        // draft()'s seed step). Warn only if more than one position is missing.
+        auto * ctx_dft = this->params.ctx_dft;
+        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+        if (pos_max < N - 2) {
+            LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
+                    "Drafts may degrade.\n",
+                    __func__, (int) pos_max, N - 2);
+        }
+    }
+
+    bool process(const llama_batch & batch_in) override {
+        if (batch_in.n_tokens <= 0) {
+            return true;
+        }
+
+        if (batch_in.token == nullptr || batch_in.embd != nullptr) {
+            return true;
+        }
+
+        const int32_t n_tokens = batch_in.n_tokens;
+
+        // i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's
+        // first/last token in batch_in. Assumes per-seq tokens are contiguous within
+        // the ubatch (server's default ordering).
+        std::vector<int32_t> i_batch_beg(n_seq, -1);
+        std::vector<int32_t> i_batch_end(n_seq, -1);
+        for (int k = 0; k < n_tokens; ++k) {
+            GGML_ASSERT(batch_in.n_seq_id[k] == 1);
+            const llama_seq_id seq_id = batch_in.seq_id[k][0];
+            if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+                continue;
+            }
+            i_batch_end[seq_id] = k;
+            if (i_batch_beg[seq_id] < 0) {
+                i_batch_beg[seq_id] = k;
+            }
+        }
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+
+        // Interleave each extract_layer's hidden state into a contiguous buffer of
+        // shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder
+        // to get one g_embd row per token.
+        features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
+
+        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
+            const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]);
+            if (!layer) {
+                GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]);
+            }
+            for (int32_t i = 0; i < n_tokens; ++i) {
+                float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
+                const float * src = layer + (size_t) i * n_embd_tgt;
+                std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float));
+            }
+        }
+
+        g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
+
+        // llama_encode() requires the full encoder batch to fit in n_ubatch.
+        // Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
+        const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
+        for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
+            const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
+
+            llama_batch enc_batch = {
+                /*.n_tokens =*/ n_chunk,
+                /*.token    =*/ nullptr,
+                /*.embd     =*/ features_buf.data() + (size_t) i * n_embd_enc,
+                /*.pos      =*/ nullptr,
+                /*.n_seq_id =*/ nullptr,
+                /*.seq_id   =*/ nullptr,
+                /*.logits   =*/ nullptr,
+            };
+            const int32_t rc = llama_encode(ctx_dft, enc_batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
+                        __func__, rc, (int) n_chunk, (int) i);
+                return false;
+            }
+
+            // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
+            const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft);
+            GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
+            std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
+                        g_embd_chunk,
+                        (size_t) n_chunk * n_embd_dec * sizeof(float));
+        }
+
+        const float * g_embd = g_embd_buf.data();
+
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        // EAGLE3 decoder input convention: at memory pos P the input pair is
+        // (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd.
+        //
+        // Per seq, in order:
+        //   (a) cross-ubatch bridge — when applicable, write the previously-deferred
+        //       pos using this ubatch's first token + pending_g_last.
+        //   (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k])
+        //       at pos[k]. The last training pos (k=end) is left unwritten = new
+        //       deferred boundary, completed by the next process() or draft() call.
+        //   (c) refresh deferred state — stash this ubatch's full g_embd into verify_g,
+        //       update pending_g_last / pending_pos_last to the last row.
+        common_batch_clear(batch);
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            const int32_t beg = i_batch_beg[seq_id];
+            const int32_t end = i_batch_end[seq_id];
+            if (beg < 0 || end < 0) {
+                continue;
+            }
+
+            // cross-ubatch bridge — complete the prior ubatch's deferred boundary.
+            // Fires iff all three preconditions hold:
+            //   1) pending_pos_last >= 0
+            //   2) pending_pos_last + 1 == pos[beg]
+            //   3) pending_pos_last > dft_pos_max // TODO: is this check needed?
+            const llama_pos pending_pos = pending_pos_last[seq_id];
+            if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
+                const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+                if (pending_pos > dft_pos_max) {
+                    common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                                pending_g_last[seq_id].data(), row_bytes);
+                }
+            }
+
+            for (int32_t k = beg; k < end; ++k) {
+                common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                            g_embd + (size_t) k * n_embd_dec, row_bytes);
+            }
+
+            // refresh deferred state
+            const int32_t n_rows = end - beg + 1;
+            verify_pos_first[seq_id] = batch_in.pos[beg];
+            pending_pos_last[seq_id] = batch_in.pos[end];
+            verify_g_rows[seq_id]    = n_rows;
+            verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f);
+            std::memcpy(verify_g[seq_id].data(),       g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows);
+            std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes);
+        }
+
+        if (batch.n_tokens > 0) {
+            const int32_t rc = llama_decode(ctx_dft, batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
+                        __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
+                return false;
+            }
+        }
+
        return true;
    }

-    void draft(common_speculative_draft_params_vec & /*dparams*/) override {
-        // TODO: implement
+    // Generate draft tokens for every sequence that is marked as drafting.
+    //
+    // Steps, as implemented below:
+    //   1. For each drafting sequence with a valid pending position, reset its sampler,
+    //      drop the draft-context memory from pending_pos_last onward and queue the
+    //      deferred (dp.id_last, pending_g_last) pair at that position.
+    //   2. Decode that batch, then repeatedly sample the top candidate per sequence,
+    //      append it to dp.result and queue it (with its pre-norm hidden state) for the
+    //      next decode step, until every sequence stopped (p_min / n_max) or decode fails.
+    //   3. Discard results that are shorter than params.n_min.
+    //
+    // dparams is indexed by seq_id; results are written through dp.result.
+    void draft(common_speculative_draft_params_vec & dparams) override {
+        auto & ctx_dft = params.ctx_dft;
+
+        common_batch_clear(batch);
+
+        // keep track of which sequences are still drafting
+        int n_drafting = 0;
+        std::vector<bool> drafting(n_seq);
+
+        // size in bytes of one embedding row copied into batch.embd
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        // Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory
+        // pos pending_pos_last. dp.id_last is target's freshest sample (= corrected
+        // token after verify, or first generated token after prefill), matching the
+        // EAGLE3 input convention (token[P+1], g_embd[P]) at pos P.
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+
+            if (!dp.drafting) {
+                continue;
+            }
+            // a negative pending position means there is no deferred pair for this sequence
+            if (pending_pos_last[seq_id] < 0) {
+                continue;
+            }
+
+            n_drafting++;
+            drafting[seq_id] = true;
+            common_sampler_reset(smpls[seq_id].get());
+
+            // remove everything from pending_pos_last onward before re-adding that position
+            llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1);
+
+            common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true);
+            // the embedding row goes into the slot of the token that was just added
+            std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                        pending_g_last[seq_id].data(),
+                        row_bytes);
+        }
+
+        // no sequence qualified for drafting
+        if (batch.n_tokens == 0) {
+            return;
+        }
+
+        int ret = llama_decode(ctx_dft, batch);
+        if (ret != 0) {
+            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            return;
+        }
+
+        // number of completed draft steps; also the position offset from pending_pos_last
+        int i = 0;
+
+        while (n_drafting > 0) {
+            // output row of the previous decode for the current sequence;
+            // NOTE(review): relies on the previous batch holding exactly one row per
+            // still-drafting sequence, added in ascending seq_id order
+            int i_batch = 0;
+
+            common_batch_clear(batch);
+
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                if (!drafting[seq_id]) {
+                    continue;
+                }
+
+                auto * smpl = smpls[seq_id].get();
+
+                common_sampler_sample(smpl, ctx_dft, i_batch, true);
+                // pre-norm hidden state of this position becomes g_embd for the next step
+                const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
+                ++i_batch;
+
+                const auto * cur_p = common_sampler_get_candidates(smpl, true);
+
+                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
+                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                }
+
+                // the first candidate is taken as the draft token
+                const llama_token id = cur_p->data[0].id;
+
+                // only collect very high-confidence draft tokens
+                // (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop)
+                if (cur_p->data[0].p < params.p_min) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+
+                    continue;
+                }
+
+                common_sampler_accept(smpl, id, true);
+
+                auto & dp = dparams.at(seq_id);
+                auto & result = *dp.result;
+
+                result.push_back(id);
+
+                // stop once the draft reached the maximum length; the last token is kept
+                // in the result but is not decoded by the draft context
+                if (params.n_max <= (int) result.size()) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+                    continue;
+                }
+
+                common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes);
+            }
+
+            // every remaining sequence stopped during this step
+            if (batch.n_tokens == 0) {
+                break;
+            }
+
+            ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }
+
+            ++i;
+        }
+
+        // drop drafts that are shorter than the configured minimum length
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+            if (!dp.drafting) {
+                continue;
+            }
+
+            if (dp.result->size() < (size_t) params.n_min) {
+                dp.result->clear();
+            }
+        }
    }

-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
-        // noop
+    // Update the deferred boundary state of seq_id after verification:
+    // pick the stored verify row that corresponds to n_accepted (clamped to the
+    // last stored row) and make it the new pending position / pending embedding.
+    void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
+        const bool seq_in_range = seq_id >= 0 && seq_id < (llama_seq_id) n_seq;
+        if (!seq_in_range) {
+            return;
+        }
+
+        // nothing was stored for this sequence
+        const int32_t rows_stored = verify_g_rows[seq_id];
+        if (rows_stored <= 0) {
+            return;
+        }
+
+        // never index past the last stored row
+        const int32_t row = std::min<int32_t>(n_accepted, rows_stored - 1);
+
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+        const auto * row_src   = verify_g[seq_id].data() + (size_t) row * n_embd_dec;
+
+        pending_pos_last[seq_id] = verify_pos_first[seq_id] + row;
+        std::memcpy(pending_g_last[seq_id].data(), row_src, row_bytes);
    }

    bool need_embd() const override {
@@ -1370,9 +1776,11 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);

        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
-        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
+        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
        bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;

+
+
        bool has_ngram_cache   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
        bool has_ngram_simple  = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
        bool has_ngram_map_k   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K));
@@ -130,6 +130,9 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LlamaBidirectionalModel": "llama",
    "LlamaForCausalLM": "llama",
    "LlamaModel": "llama",
+    "Eagle3DraftModel": "llama",
+    "Eagle3Speculator": "llama",
+    "LlamaForCausalLMEagle3": "llama",
    "LlavaForConditionalGeneration": "llama",
    "LlavaStableLMEpochForCausalLM": "stablelm",
    "MPTForCausalLM": "mpt",
@@ -94,6 +94,7 @@ class ModelBase:
    metadata: gguf.Metadata
    dir_model_card: Path
    remote_hf_model_id: str | None
+    target_model_dir: Path | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -119,6 +120,7 @@ class ModelBase:
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                 disable_mistral_community_chat_template: bool = False,
                 sentence_transformers_dense_modules: bool = False,
+                 target_model_dir: Path | None = None,
                 fuse_gate_up_exps: bool = False,
                 fp8_as_q8: bool = False):
        if type(self) is ModelBase or \
@@ -139,6 +141,7 @@ class ModelBase:
        self.dry_run = dry_run
        self.remote_hf_model_id = remote_hf_model_id
        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
+        self.target_model_dir = target_model_dir
        self.fuse_gate_up_exps = fuse_gate_up_exps
        self._gate_exp_buffer: dict[int, Tensor] = {}
        self._up_exp_buffer: dict[int, Tensor] = {}
@@ -2481,6 +2484,7 @@ class LazyTorchTensor(gguf.LazyBase):
        torch.float16: np.float16,
        torch.float32: np.float32,
        torch.uint8: np.uint8,
+        torch.int64: np.int64,
    }

    # only used when byteswapping data. Only correct size is needed
@@ -5,12 +5,13 @@ import math

 from typing import Callable, Iterable, TYPE_CHECKING

+import numpy as np
 import torch

 if TYPE_CHECKING:
    from torch import Tensor

-from .base import ModelBase, TextModel, gguf
+from .base import ModelBase, TextModel, gguf, logger


@ModelBase.register(
@@ -21,6 +22,9 @@ from .base import ModelBase, TextModel, gguf
    "VLlama3ForCausalLM",
    "LlavaForConditionalGeneration",
    "VoxtralForConditionalGeneration",
+    "LlamaForCausalLMEagle3",
+    "Eagle3Speculator",
+    "Eagle3DraftModel",
    "IQuestCoderForCausalLM",
    "LlamaModel")
 class LlamaModel(TextModel):
@@ -39,7 +43,61 @@ class LlamaModel(TextModel):
            hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
            self.origin_hf_arch = hparams.get('architectures', [None])[0]

+        # Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
+        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
+            self.is_eagle3 = True
+            self.model_arch = gguf.MODEL_ARCH.EAGLE3
+            logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
+            # Re-initialize tensor_map with eagle3 architecture
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+            # Update gguf_writer architecture
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+            self.gguf_writer.add_architecture()
+            if self.target_model_dir is None:
+                raise ValueError(
+                    "EAGLE-3 model requires --target-model-dir to be specified. "
+                    "Please provide the path to the target model directory to read config.json"
+                )
+            # Read both eagle3 raw config and target model config
+            with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
+                eagle3_raw_config = json.load(f)
+            with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
+                target_config = json.load(f)
+
+            if "text_config" in target_config:
+                target_config = {**target_config, **target_config["text_config"]}
+            self.target_vocab_size = target_config["vocab_size"]
+
+            # target_layers: derived from target model layer count (low/mid/high)
+            target_num_layers = target_config["num_hidden_layers"]
+            target_layers = [2, target_num_layers // 2, target_num_layers - 3]
+            logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
+            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
+
+            # target_hidden_size: prefer eagle3 config, fallback to target config
+            if eagle3_raw_config.get("target_hidden_size") is not None:
+                target_hidden_size = eagle3_raw_config["target_hidden_size"]
+                src = "EAGLE-3 config"
+            else:
+                target_hidden_size = target_config["hidden_size"]
+                src = "target model config"
+            logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
+            self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
+
+            # norm_before_residual (RedHat-style eagle3 specific)
+            norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
+            logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
+            self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
+
    def set_vocab(self):
+        # eagle3: use tokenizer from target model if provided
+        original_dir_model = None
+        if getattr(self, 'is_eagle3', False):
+            assert self.target_model_dir is not None
+            logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
+            original_dir_model = self.dir_model
+            self.dir_model = self.target_model_dir
+
        if self.origin_hf_arch == "GlmasrModel":
            return self._set_vocab_glmedge()

@@ -85,6 +143,10 @@ class LlamaModel(TextModel):
        if self.hparams.get("vocab_size", 32000) == 49152:
            self.gguf_writer.add_add_bos_token(False)

+        # eagle3: Restore original dir_model
+        if original_dir_model is not None:
+            self.dir_model = original_dir_model
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
@@ -129,7 +191,49 @@ class LlamaModel(TextModel):

        return super().filter_tensors((name, gen))

+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        tensors = super().index_tensors(remote_hf_model_id)
+
+        # Handle Eagle3Speculator nested config
+        if "transformer_layer_config" in self.hparams:
+            self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
+
+        # eagle3 detection
+        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
+            logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
+            new_tensors = {}
+            for name, gen in tensors.items():
+                if name.startswith("midlayer."):
+                    new_name = "model.layers.0." + name[len("midlayer."):]
+                    new_tensors[new_name] = gen
+                elif name.startswith("layers.0."):  # Eagle3Speculator format
+                    new_name = "model." + name
+                    new_tensors[new_name] = gen
+                else:
+                    new_tensors[name] = gen
+            return new_tensors
+
+        return tensors
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # eagle3: special tensors that bypass standard llama mapping
+        if getattr(self, 'is_eagle3', False):
+            if name == "fc.weight":
+                yield (name, data_torch)
+                return
+            if name == "d2t":
+                # store for manual int64 handling in prepare_tensors (avoid F32 conversion)
+                if not hasattr(self, '_eagle3_int_tensors'):
+                    self._eagle3_int_tensors = {}
+                self._eagle3_int_tensors[name] = data_torch
+                return
+            if name == "t2d":
+                # not used at runtime, skip
+                return
+            if name.endswith(".hidden_norm.weight"):
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
+                return
+
        n_head = self.find_hparam(["n_heads", "num_attention_heads"])
        n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])

@@ -205,8 +309,33 @@ class LlamaModel(TextModel):
                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

    def prepare_tensors(self):
+        # eagle3: collect d2t original dtype before parent converts tensors to F32
+        eagle3_original_dtypes = {}
+        if getattr(self, 'is_eagle3', False):
+            for name, data_torch in self.get_tensors():
+                if name == "d2t":
+                    eagle3_original_dtypes[name] = data_torch.dtype
+
        super().prepare_tensors()

+        # eagle3: write d2t as absolute target token ids
+        if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
+            for name, data_torch in self._eagle3_int_tensors.items():
+                old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
+                data = data_torch.to(torch.int64).cpu().numpy()
+                if name == "d2t":
+                    data = data.reshape(-1)
+                    data = data + np.arange(data.size, dtype=np.int64)
+                    if np.any((data < 0) | (data >= self.target_vocab_size)):
+                        raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
+                    if np.unique(data).size != data.size:
+                        raise ValueError("EAGLE-3 d2t contains duplicate target ids")
+                data_qtype = gguf.GGMLQuantizationType.I64
+
+                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+                self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
+
        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
@@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
        help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
    )

+    parser.add_argument(
+        "--target-model-dir", type=str, default=None,
+        help=(
+            "path to the target model directory; required when converting a standalone draft model "
+            "(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
+            "layer count to populate its GGUF."
+        ),
+    )
+
    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")
@@ -269,6 +278,7 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split,
                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
+                                     target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
                                     fuse_gate_up_exps=args.fuse_gate_up_exps,
                                     fp8_as_q8=args.fp8_as_q8,
                                     )
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 15)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -7741,6 +7741,23 @@ static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, si
    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);

+        std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool);
+        ggml_vk_ctx_begin(src->device, subctx);
+        subctx->s->buffer->buf.pipelineBarrier(
+            vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer,
+            vk::PipelineStageFlagBits::eHost,
+            {},
+            { { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite,
+                vk::AccessFlagBits::eHostRead } },
+            {}, {});
+        ggml_vk_ctx_end(subctx);
+        ggml_vk_submit(subctx, src->device->fence);
+        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX),
+                 "vk_buffer_read_2d uma waitForFences");
+        src->device->device.resetFences({ src->device->fence });
+        ggml_vk_queue_command_pools_cleanup(src->device);
+
        if (width == spitch && width == dpitch) {
            memcpy(dst, (const uint8_t *) src->ptr + offset, width * height);
        } else {
@@ -154,6 +154,9 @@ class Keys:
        HIDDEN_ACT                        = "{arch}.hidden_activation"
        DENSE_FEAT_IN_SIZE                = "{arch}.{dense}_feat_in"
        DENSE_FEAT_OUT_SIZE               = "{arch}.{dense}_feat_out"
+        TARGET_LAYERS                     = "{arch}.target_layers"
+        TARGET_HIDDEN_SIZE                = "{arch}.target_hidden_size"
+        NORM_BEFORE_RESIDUAL              = "{arch}.norm_before_residual"

    class Attention:
        HEAD_COUNT                   = "{arch}.attention.head_count"
@@ -511,6 +514,7 @@ class MODEL_ARCH(IntEnum):
    RND1             = auto()
    PANGU_EMBED      = auto()
    MISTRAL3         = auto()
+    EAGLE3           = auto()
    MISTRAL4         = auto()
    PADDLEOCR        = auto()
    MIMO2            = auto()
@@ -901,14 +905,17 @@ class MODEL_TENSOR(IntEnum):
    A_PER_DIM_K_SCALE     = auto() # gemma4
    A_PER_DIM_SCALE       = auto() # gemma4
    # nextn/mtp
-    NEXTN_PROJ_PRE       = auto()
-    NEXTN_PROJ_POST      = auto()
-    NEXTN_EH_PROJ        = auto()
-    NEXTN_EMBED_TOKENS   = auto()
-    NEXTN_ENORM          = auto()
-    NEXTN_HNORM          = auto()
+    NEXTN_PROJ_PRE         = auto()
+    NEXTN_PROJ_POST        = auto()
+    NEXTN_EH_PROJ          = auto()
+    NEXTN_EMBED_TOKENS     = auto()
+    NEXTN_ENORM            = auto()
+    NEXTN_HNORM            = auto()
    NEXTN_SHARED_HEAD_HEAD = auto()
    NEXTN_SHARED_HEAD_NORM = auto()
+    # eagle3
+    FC                     = auto()  # feature fusion layer
+    D2T                    = auto()  # draft to target vocabulary mapping
    # lfm2 audio
    A_ENC_NORM_CONV        = auto()
    A_ENC_LINEAR_POS       = auto()
@@ -1063,6 +1070,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.RND1:             "rnd1",
    MODEL_ARCH.PANGU_EMBED:      "pangu-embedded",
    MODEL_ARCH.MISTRAL3:         "mistral3",
+    MODEL_ARCH.EAGLE3:           "eagle3",
    MODEL_ARCH.MISTRAL4:         "mistral4",
    MODEL_ARCH.PADDLEOCR:        "paddleocr",
    MODEL_ARCH.MIMO2:            "mimo2",
@@ -1095,8 +1103,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.POS_EMBD:                  "position_embd",
    MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
    MODEL_TENSOR.OUTPUT:                    "output",
-    MODEL_TENSOR.DENSE_2_OUT:                "dense_2", # embeddinggemma 2_Dense
-    MODEL_TENSOR.DENSE_3_OUT:                "dense_3", # embeddinggemma 2_Dense
+    MODEL_TENSOR.DENSE_2_OUT:               "dense_2", # embeddinggemma 2_Dense
+    MODEL_TENSOR.DENSE_3_OUT:               "dense_3", # embeddinggemma 2_Dense
    MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
    MODEL_TENSOR.ROPE_FACTORS_LONG:         "rope_factors_long",
    MODEL_TENSOR.ROPE_FACTORS_SHORT:        "rope_factors_short",
@@ -1488,6 +1496,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.NEXTN_HNORM:               "blk.{bid}.nextn.hnorm",
    MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.nextn.shared_head_head",
    MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.nextn.shared_head_norm",
+    MODEL_TENSOR.FC:                        "fc",
+    MODEL_TENSOR.D2T:                       "d2t",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -4028,6 +4038,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
+    MODEL_ARCH.EAGLE3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FC,
+        MODEL_TENSOR.D2T,
+    ],
    MODEL_ARCH.MISTRAL4: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -1 +1 @@
-a5ce761c70415ebb9066a76d1efd3b938047e21e
+3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
@@ -5,7 +5,7 @@ import os
 import sys
 import subprocess

-HTTPLIB_VERSION = "refs/tags/v0.46.1"
+HTTPLIB_VERSION = "refs/tags/v0.47.0"

 vendor = {
    "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
@@ -16,11 +16,80 @@ set(HF_ENABLED        "" CACHE STRING "Whether to allow HF Bucket download (ON/O
 set(BUILD_UI          "" CACHE STRING "Build UI via npm (ON/OFF)")
 set(LLAMA_UI_EMBED    "" CACHE STRING "Path to llama-ui-embed helper")

+# IMPORTANT: PWA assets are declared in 3 places that must be kept in sync:
+#   1. tools/ui/src/lib/constants/pwa.ts   (APPLE_DEVICES, PWA_MANIFEST, PUBLIC_ENDPOINTS)
+#   2. tools/server/server-http.cpp        (public_endpoints; splash screens generated via helper)
+#   3. scripts/ui-assets.cmake             (ASSETS list below)
+#
+# When adding/changing PWA assets, update tools/ui/src/lib/constants/pwa.ts first,
+# then sync any new file names here and in server-http.cpp.
 set(ASSETS
-    bundle.css
-    bundle.js
    index.html
    loading.html
+    # PWA assets
+    favicon.ico
+    favicon-dark.ico
+    favicon.svg
+    favicon-dark.svg
+    pwa-64x64.png
+    pwa-192x192.png
+    pwa-512x512.png
+    maskable-icon-512x512.png
+    apple-touch-icon-180x180.png
+    # iOS splash screens
+    apple-splash-portrait-640x1136.png
+    apple-splash-landscape-1136x640.png
+    apple-splash-portrait-750x1334.png
+    apple-splash-landscape-1334x750.png
+    apple-splash-portrait-1170x2532.png
+    apple-splash-landscape-2532x1170.png
+    apple-splash-portrait-1179x2556.png
+    apple-splash-landscape-2556x1179.png
+    apple-splash-portrait-1206x2622.png
+    apple-splash-landscape-2622x1206.png
+    apple-splash-portrait-1284x2778.png
+    apple-splash-landscape-2778x1284.png
+    apple-splash-portrait-1290x2796.png
+    apple-splash-landscape-2796x1290.png
+    apple-splash-portrait-1320x2868.png
+    apple-splash-landscape-2868x1320.png
+    apple-splash-portrait-1488x2266.png
+    apple-splash-landscape-2266x1488.png
+    apple-splash-portrait-1640x2360.png
+    apple-splash-landscape-2360x1640.png
+    apple-splash-portrait-1668x2388.png
+    apple-splash-landscape-2388x1668.png
+    apple-splash-portrait-2048x2732.png
+    apple-splash-landscape-2732x2048.png
+    # iOS dark splash screens
+    apple-splash-portrait-dark-640x1136.png
+    apple-splash-landscape-dark-1136x640.png
+    apple-splash-portrait-dark-750x1334.png
+    apple-splash-landscape-dark-1334x750.png
+    apple-splash-portrait-dark-1170x2532.png
+    apple-splash-landscape-dark-2532x1170.png
+    apple-splash-portrait-dark-1179x2556.png
+    apple-splash-landscape-dark-2556x1179.png
+    apple-splash-portrait-dark-1206x2622.png
+    apple-splash-landscape-dark-2622x1206.png
+    apple-splash-portrait-dark-1284x2778.png
+    apple-splash-landscape-dark-2778x1284.png
+    apple-splash-portrait-dark-1290x2796.png
+    apple-splash-landscape-dark-2796x1290.png
+    apple-splash-portrait-dark-1320x2868.png
+    apple-splash-landscape-dark-2868x1320.png
+    apple-splash-portrait-dark-1640x2360.png
+    apple-splash-landscape-dark-2360x1640.png
+    apple-splash-portrait-dark-1668x2388.png
+    apple-splash-landscape-dark-2388x1668.png
+    apple-splash-portrait-dark-2048x2732.png
+    apple-splash-landscape-dark-2732x2048.png
+    manifest.webmanifest
+    sw.js
+    _app/version.json
+    build.json
 )

 set(DIST_DIR     "${UI_BINARY_DIR}/dist")
@@ -159,7 +228,7 @@ function(npm_build out_var)

    message(STATUS "UI: running npm run build, output -> ${DIST_DIR}")
    execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}"
+        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" "LLAMA_UI_VERSION=${HF_VERSION}" "LLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
                ${NPM_EXECUTABLE} run build
        WORKING_DIRECTORY "${UI_SOURCE_DIR}"
        RESULT_VARIABLE rc
@@ -274,8 +343,35 @@ function(emit_files)
        foreach(asset ${ASSETS})
            list(APPEND args "${asset}" "${DIST_DIR}/${asset}")
        endforeach()
+
+        # Bundle files live in _app/immutable/ — vanilla SvelteKit output, no plugin
+        # rewriting. Embedded names must match the exact _app/ paths that index.html
+        # and sw.js reference.
+        file(GLOB_RECURSE detected_bundle_js "${DIST_DIR}/_app/immutable/bundle.*.js")
+        file(GLOB_RECURSE detected_bundle_css "${DIST_DIR}/_app/immutable/assets/bundle.*.css")
+        file(GLOB_RECURSE detected_workbox "${DIST_DIR}/workbox-*.js")
+        # Compute relative path from DIST_DIR to each found file.
+        # e.g. /path/to/build/tools/ui/dist/_app/immutable/bundle.XXX.js
+        #      -> _app/immutable/bundle.XXX.js
+        foreach(f ${detected_bundle_js})
+            string(REPLACE "${DIST_DIR}/" "" rel "${f}")
+            list(APPEND args "${rel}" "${f}")
+        endforeach()
+        foreach(f ${detected_bundle_css})
+            string(REPLACE "${DIST_DIR}/" "" rel "${f}")
+            list(APPEND args "${rel}" "${f}")
+        endforeach()
+        foreach(f ${detected_workbox})
+            string(REPLACE "${DIST_DIR}/" "" rel "${f}")
+            list(APPEND args "${rel}" "${f}")
+        endforeach()
    endif()

+    # build.json carries the llama.cpp build number for UI version display.
+    # It is separate from SvelteKit's _app/version.json (used for SW cache invalidation).
+    # build.json is generated by the vite plugin (buildInfoPlugin) during npm build;
+    # CMake does not create it, it just embeds it from the dist that npm produced.
+
    execute_process(
        COMMAND "${LLAMA_UI_EMBED}" ${args}
        RESULT_VARIABLE rc
@@ -300,6 +396,8 @@ endif()
 set(provisioned FALSE)

 if(BUILD_UI)
+    # Resolve version from git build-info if not explicitly set
+    resolve_version(HF_VERSION)
    npm_build(NPM_OK)
    if(NPM_OK)
        set(provisioned TRUE)
@@ -3,7 +3,6 @@
 #include "llama-impl.h"

 #include <map>
-#include <set>
 #include <vector>

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -128,6 +127,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_RND1,             "rnd1"             },
    { LLM_ARCH_PANGU_EMBED,      "pangu-embedded"   },
    { LLM_ARCH_MISTRAL3,         "mistral3"         },
+    { LLM_ARCH_EAGLE3,           "eagle3"           },
    { LLM_ARCH_MISTRAL4,         "mistral4"         },
    { LLM_ARCH_PADDLEOCR,        "paddleocr"        },
    { LLM_ARCH_MIMO2,            "mimo2"            },
@@ -292,12 +292,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

+    { LLM_KV_TARGET_LAYERS,         "%s.target_layers"        },
+    { LLM_KV_TARGET_HIDDEN_SIZE,    "%s.target_hidden_size"   },
+    { LLM_KV_NORM_BEFORE_RESIDUAL,  "%s.norm_before_residual" },
+
    { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
    // sentence-transformers dense modules feature dims
    { LLM_KV_DENSE_2_FEAT_IN,        "%s.dense_2_feat_in"  },
-    { LLM_KV_DENSE_2_FEAT_OUT,       "%s.dense_2_feat_out"  },
-    { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"   },
-    { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out"  },
+    { LLM_KV_DENSE_2_FEAT_OUT,       "%s.dense_2_feat_out" },
+    { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"  },
+    { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out" },

    { LLM_KV_TOKENIZER_MODEL,                    "tokenizer.ggml.model"                    },
    { LLM_KV_TOKENIZER_PRE,                      "tokenizer.ggml.pre"                      },
@@ -562,6 +566,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
    { LLM_TENSOR_INDEXER_ATTN_Q_B,                       "blk.%d.indexer.attn_q_b" },
    { LLM_TENSOR_MASKED_EMBD_CENTROIDS,                  "masked_embd_centroids" },
    { LLM_TENSOR_MASKED_EMBD_ORDERING,                   "masked_embd_ordering" },
+    { LLM_TENSOR_FC,                                     "fc" },
+    { LLM_TENSOR_D2T,                                    "d2t" },
 };

 // declare information about the model weight tensors:
@@ -788,6 +794,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_LATENT_UP,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_MASKED_EMBD_CENTROIDS,      {LLM_TENSOR_LAYER_INPUT,     GGML_OP_NONE}},
    {LLM_TENSOR_MASKED_EMBD_ORDERING,       {LLM_TENSOR_LAYER_INPUT,     GGML_OP_NONE}},
+    // eagle3
+    {LLM_TENSOR_FC,                         {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_D2T,                        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
 };

 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -141,6 +141,7 @@ enum llm_arch {
    LLM_ARCH_KIMI_LINEAR,
    LLM_ARCH_TALKIE,
    LLM_ARCH_MELLUM,
+    LLM_ARCH_EAGLE3,
    LLM_ARCH_UNKNOWN,
 };

@@ -337,6 +338,10 @@ enum llm_kv {

    LLM_KV_CLASSIFIER_OUTPUT_LABELS,

+    LLM_KV_TARGET_LAYERS,
+    LLM_KV_TARGET_HIDDEN_SIZE,
+    LLM_KV_NORM_BEFORE_RESIDUAL,
+
    LLM_KV_SHORTCONV_L_CACHE,

    LLM_KV_XIELU_ALPHA_N,
@@ -569,6 +574,8 @@ enum llm_tensor {
    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
    LLM_TENSOR_MASKED_EMBD_CENTROIDS,
    LLM_TENSOR_MASKED_EMBD_ORDERING,
+    LLM_TENSOR_FC,
+    LLM_TENSOR_D2T,
 };


@@ -71,6 +71,9 @@ llama_context::llama_context(
    cparams.no_perf                 = params.no_perf;
    cparams.warmup                  = false;

+    cparams.embeddings_layer_inp.resize(hparams.n_layer(), false);
+    embd_layer_inp.resize(hparams.n_layer());
+
    cparams.ctx_type     = params.ctx_type;
    cparams.pooling_type = params.pooling_type;

@@ -91,12 +94,21 @@ llama_context::llama_context(
    if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
        if (params.ctx_other == nullptr) {
            // TODO: change from runtime_error to llama_exception to avoid printing error message
-            throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)");
+            throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)");
        }

        cparams.ctx_other = params.ctx_other;
    }

+    if (model.arch == LLM_ARCH_EAGLE3) {
+        if (model.tok_embd == nullptr || model.output == nullptr) {
+            if (params.ctx_other == nullptr) {
+                throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)");
+            }
+            cparams.ctx_other = params.ctx_other;
+        }
+    }
+
    // Initialize backend samplers here so they are part of the sampling graph
    // before the reserve passes run later in this function. This avoids a later
    // re-reserve when graph nodes change.
@@ -194,7 +206,7 @@ llama_context::llama_context(

    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

-    cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
+    cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max;

    cparams.op_offload = params.op_offload;
    cparams.kv_unified = params.kv_unified;
@@ -938,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) {
    }
 }

+// Return the host buffer that holds the captured input embeddings of layer `lid`.
+// Aborts (GGML_ASSERT) when `lid` is out of range or no buffer is attached for that layer.
+float * llama_context::get_embeddings_layer_inp(uint32_t lid) {
+    // apply any pending output row reordering before exposing the data
+    output_reorder();
+
+    GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data());
+
+    const buffer_view<float> & layer_view = embd_layer_inp[lid];
+
+    return layer_view.data;
+}
+
 llama_token llama_context::get_sampled_token_ith(int32_t idx) {
    output_reorder();

@@ -1125,6 +1145,17 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) {
    cparams.embeddings_nextn_masked = masked;
 }

+// Enable or disable capturing the input embeddings of layer `lid`.
+//   lid    - layer index, must be < model.hparams.n_layer() (asserted)
+//   enable - true to have decode() copy that layer's input into the host-side buffer
+void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
+    // lid is unsigned, so it is printed with %u
+    LLAMA_LOG_DEBUG("%s: lid = %u, enable = %d\n", __func__, lid, enable);
+
+    GGML_ASSERT(lid < model.hparams.n_layer());
+
+    cparams.embeddings_layer_inp[lid] = enable;
+
+    // note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected
+    sched_need_reserve = true;
+}
+
 void llama_context::set_causal_attn(bool value) {
    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);

@@ -1350,7 +1381,8 @@ int llama_context::encode(const llama_batch & batch_inp) {

    const auto & hparams = model.hparams;

-    const int64_t n_embd  = hparams.n_embd_inp();
+    // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
+    const int64_t n_embd = hparams.n_embd_inp();
    const int64_t n_vocab = model.vocab.n_tokens();

    // note: during encode, we always pass the full sequence starting from pos = 0
@@ -1925,6 +1957,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
            }
        }

+        extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens);
+
        // extract nextn embeddings before
        // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
        {
@@ -2029,6 +2063,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {

    const auto n_batch    = cparams.n_batch;
    const auto n_vocab    = vocab.n_tokens();
+    const auto n_embd     = hparams.n_embd;
    const auto n_embd_out = hparams.n_embd_out();

    bool has_logits     = true;
@@ -2041,9 +2076,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
        has_embd   = true;
    }

-
    size_t backend_float_count = 0;
    size_t backend_token_count = 0;
+    size_t embd_layer_inp_float_count = 0;

    logits.size     = has_logits     ? n_vocab*n_outputs_max     : 0;
    embd.size       = has_embd       ? n_embd_out*n_outputs_max  : 0;
@@ -2055,6 +2090,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
        embd_nextn.size = (size_t) n_embd_out * n_batch;
    }

+    for (bool enabled : cparams.embeddings_layer_inp) {
+        if (enabled) {
+            embd_layer_inp_float_count += (size_t) n_embd * n_batch;
+        }
+    }
+
    // Allocate backend sampling output buffers if there are backend samplers configured.
    const bool has_sampling = !sampling.samplers.empty();
    if (has_sampling) {
@@ -2069,8 +2110,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {

    const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
    const size_t new_size  =
-        (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) +
-        (                                               backend_token_count) * sizeof(llama_token);
+        (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
+        (                                                                         backend_token_count) * sizeof(llama_token);

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
@@ -2087,6 +2128,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
            logits.data = nullptr;
            embd.data = nullptr;
            embd_nextn.data = nullptr;
+            for (auto & layer_inp : embd_layer_inp) {
+                layer_inp = {nullptr, 0};
+            }
        }

        auto * buft = ggml_backend_cpu_buffer_type();
@@ -2118,6 +2162,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    embd_nextn = has_embd_nextn ? buffer_view<float>{(float *) (base + offset), embd_nextn.size} : buffer_view<float>{nullptr, 0};
    offset += embd_nextn.size * sizeof(float);

+    for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
+        if (cparams.embeddings_layer_inp[il]) {
+            embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
+            offset += embd_layer_inp[il].size * sizeof(float);
+        } else {
+            embd_layer_inp[il] = buffer_view<float>{nullptr, 0};
+        }
+    }
+
    if (has_sampling) {
        sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
        offset += sampling.logits.size * sizeof(float);
@@ -2164,6 +2217,34 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    return n_outputs_max;
 }

+// Copy the input tensor of every capture-enabled layer from its backend into the matching
+// host buffer in embd_layer_inp.
+//   res          - graph result that owns the per-layer input tensors
+//   token_offset - index of the first destination row for this ubatch
+//   n_tokens     - number of tokens (rows) produced by this ubatch, must be > 0
+// The copies are issued with ggml_backend_tensor_get_async; nothing here waits for them.
+void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
+    const auto & capture = cparams.embeddings_layer_inp;
+
+    for (uint32_t il = 0; il < capture.size(); ++il) {
+        if (!capture[il]) {
+            continue;
+        }
+
+        buffer_view<float> & dst = embd_layer_inp[il];
+        if (!dst.has_data()) {
+            GGML_ABORT("output layer input buffer not allocated");
+        }
+
+        ggml_tensor * src = res->get_layer_inp((int) il);
+        if (src == nullptr) {
+            GGML_ABORT("layer input tensor not found");
+        }
+
+        // the tensor is treated as a flat array of floats split evenly across the tokens
+        const size_t nbytes  = ggml_nbytes(src);
+        const size_t nfloats = nbytes / sizeof(float);
+        GGML_ASSERT(n_tokens > 0);
+        GGML_ASSERT(nfloats % n_tokens == 0);
+
+        // destination starts at row `token_offset`; one row is nfloats / n_tokens floats
+        const size_t dst_offset = token_offset * (nfloats / n_tokens);
+        GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size);
+
+        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), src);
+        GGML_ASSERT(backend != nullptr);
+
+        ggml_backend_tensor_get_async(backend, src, dst.data + dst_offset, 0, nbytes);
+    }
+}
+
 void llama_context::output_reorder() {
    const uint64_t n_vocab = model.vocab.n_tokens();
    const uint64_t n_embd  = model.hparams.n_embd;
@@ -2190,6 +2271,16 @@ void llama_context::output_reorder() {
            }
        }

+        if (embd_layer_inp.size() > 0) {
+            for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) {
+                if (embd_layer_inp[lid].size > 0) {
+                    for (uint64_t k = 0; k < n_embd; ++k) {
+                        std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
+                    }
+                }
+            }
+        }
+
        if (!sampling.samplers.empty()) {
            assert(sampling.logits.size > 0);
            assert(sampling.probs.size > 0);
@@ -3604,6 +3695,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) {
    ctx->set_embeddings_nextn(value, masked);
 }

+// API wrapper: forwards to llama_context::set_embeddings_layer_inp(lid, value).
+// note: ctx is dereferenced without a null check
+void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) {
+    ctx->set_embeddings_layer_inp(lid, value);
+}
+
 llama_memory_t llama_get_memory(const struct llama_context * ctx) {
    if (!ctx) {
        return nullptr;
@@ -3624,6 +3719,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) {
    return ctx->get_embeddings_nextn_ith(i);
 }

+// API wrapper: returns the host buffer with the captured input embeddings of layer `lid`.
+// Calls ctx->synchronize() first - presumably so that the async copies issued during decode
+// have completed before the pointer is handed out (TODO confirm).
+// note: ctx is dereferenced without a null check
+float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) {
+    ctx->synchronize();
+
+    return ctx->get_embeddings_layer_inp(lid);
+}
+
 bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
    return ctx->set_sampler(seq_id, smpl);
 }
@@ -88,6 +88,8 @@ struct llama_context {
    float * get_embeddings_nextn();
    float * get_embeddings_nextn_ith(int32_t i);

+    float * get_embeddings_layer_inp(uint32_t lid);
+
    llama_token * get_sampled_tokens() const;
    llama_token   get_sampled_token_ith(int32_t idx);

@@ -112,6 +114,7 @@ struct llama_context {

    void set_embeddings (bool value);
    void set_embeddings_nextn(bool value, bool masked);
+    void set_embeddings_layer_inp(uint32_t lid, bool enable);
    void set_causal_attn(bool value);
    void set_warmup(bool value);

@@ -226,6 +229,10 @@ private:
    // map the output row index `i` to batch index
    int64_t output_resolve_row(int32_t i) const;

+    // async-copy enabled layer-input tensors (per cparams.embeddings_layer_inp)
+    // from backend into host-side embd_layer_inp buffers
+    void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens);
+
    //
    // graph
    //
@@ -288,6 +295,10 @@ private:
    // sets llm_graph_result::t_h_nextn
    buffer_view<float> embd_nextn = {nullptr, 0};

+    // host buffers for output layer input embeddings, per layer
+    // populated when cparams.embeddings_layer_inp[il] is true
+    std::vector<buffer_view<float>> embd_layer_inp;
+
    struct sampling_info {
        // !samplers.empty() to check if any samplers are active
        std::map<llama_seq_id, llama_sampler *> samplers;
@@ -3,6 +3,7 @@
 #include "llama.h"

 #include <cstdint>
+#include <vector>

 #define LLAMA_MAX_SEQ 256

@@ -44,6 +45,8 @@ struct llama_cparams {
    bool kv_unified;
    bool pipeline_parallel;

+    std::vector<bool> embeddings_layer_inp; // [n_layer()] extract input embeddings for layer
+
    enum llama_context_type ctx_type;
    enum llama_pooling_type pooling_type;

@@ -2,6 +2,7 @@

 // this is a staging header for new llama.cpp API
 // breaking changes and C++ are allowed. everything here should be considered WIP
+// try as much as possible to not include this header in the rest of the codebase

 #include "llama.h"

@@ -101,4 +102,20 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
 // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);

+// Set whether the context outputs the input embeddings of a specific layer
+LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);
+
+// mirrors:
+// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);
+
 LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
+
+//
+// model/context data extraction
+//
+
+// returns pointer to the target-model layer indices
+LLAMA_API const int32_t * llama_model_target_layer_ids  (const struct llama_model * model);
+// returns the number of extracted layers from target model
+LLAMA_API uint32_t        llama_model_target_layer_ids_n(const struct llama_model * model);
@@ -904,6 +904,10 @@ void llm_graph_result::reset() {
    t_logits      = nullptr;
    t_embd        = nullptr;
    t_embd_pooled = nullptr;
+
+    t_layer_inp.resize(LLAMA_MAX_LAYERS);
+    std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
+
    t_sampled.clear();
    t_sampled_probs.clear();
    t_sampled_logits.clear();
@@ -932,7 +936,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
    }
 }

-void llm_graph_result::set_outputs() {
+void llm_graph_result::set_outputs(const llm_graph_params & params) {
    if (t_logits != nullptr) {
        ggml_set_output(t_logits);
    }
@@ -945,6 +949,15 @@ void llm_graph_result::set_outputs() {
    if (t_h_nextn != nullptr) {
        ggml_set_output(t_h_nextn);
    }
+    {
+        const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp;
+        for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) {
+            if (embeddings_layer_inp[il]) {
+                GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null");
+                ggml_set_output(t_layer_inp[il]);
+            }
+        }
+    }
    for (auto & [seq_id, t] : t_sampled) {
        if (t != nullptr) {
            ggml_set_output(t);
@@ -705,6 +705,8 @@ public:
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
    ggml_tensor * get_h_nextn()     const { return t_h_nextn; }

+    ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; }
+
    ggml_cgraph  * get_gf()  const { return gf; }
    ggml_context * get_ctx() const { return ctx_compute.get(); }

@@ -713,7 +715,7 @@ public:
    void reset();

    void set_inputs(const llama_ubatch * ubatch);
-    void set_outputs();
+    void set_outputs(const llm_graph_params & params);

    // try to update the existing graph result using the new graph parameters in order to reuse it
    // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -734,10 +736,12 @@ public:
    ggml_tensor * t_embd_pooled = nullptr;
    ggml_tensor * t_h_nextn     = nullptr; // [n_embd, n_outputs] hidden state before final output norm

-    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
-    std::map<llama_seq_id, ggml_tensor*> t_candidates;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
+    std::vector<ggml_tensor *> t_layer_inp;
+
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
+    std::map<llama_seq_id, ggml_tensor *> t_candidates;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;

    std::vector<llm_graph_input_ptr> inputs;

@@ -45,6 +45,7 @@ struct llama_hparams {
    bool rope_finetuned;
    bool use_par_res;
    bool swin_norm;
+    bool norm_before_residual = false;

    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
@@ -394,6 +394,7 @@ namespace GGUFMeta {

    template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
    template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
+    template bool llama_model_loader::get_arr<std::vector<int32_t>>(enum llm_kv kid, std::vector<int32_t> & result, bool required);

    template<typename T>
    bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
@@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
            return new llama_model_qwen35moe(params);
        case LLM_ARCH_MISTRAL3:
            return new llama_model_mistral3(params);
+        case LLM_ARCH_EAGLE3:
+            return new llama_model_eagle3(params);
        case LLM_ARCH_MIMO2:
            return new llama_model_mimo2(params);
        case LLM_ARCH_KIMI_LINEAR:
@@ -2238,7 +2240,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    // TODO: move reranking logic here and generalize
    llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);

-    llm->res->set_outputs();
+    llm->res->set_outputs(params);

    return llm->res->get_gf();
 }
@@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_EAGLE3:
        case LLM_ARCH_MISTRAL4:
        case LLM_ARCH_LLAMA_EMBED:
        case LLM_ARCH_MAINCODER:
@@ -2600,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) {

 bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
-        case LLM_ARCH_T5:        return true;
-        case LLM_ARCH_T5ENCODER: return true;
+        case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
+        case LLM_ARCH_EAGLE3:    return true;
        default:                 return false;
    }
 }
@@ -2687,3 +2691,12 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
    }
 }
+
+// Returns a pointer to the model's target layer ids, or nullptr when the model has none.
+const int32_t * llama_model_target_layer_ids(const struct llama_model * model) {
+    const std::vector<int32_t> & ids = model->target_layer_ids;
+
+    if (ids.empty()) {
+        return nullptr;
+    }
+
+    return ids.data();
+}
+
+// Returns how many target layer ids the model stores (0 when there are none).
+uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) {
+    const size_t n_ids = model->target_layer_ids.size();
+
+    return static_cast<uint32_t>(n_ids);
+}
@@ -569,6 +569,13 @@ struct llama_model {
    struct ggml_tensor * per_layer_model_proj = nullptr;
    struct ggml_tensor * per_layer_proj_norm  = nullptr;

+    // eagle3
+    struct ggml_tensor * fc  = nullptr;  // feature fusion layer
+    struct ggml_tensor * d2t = nullptr;  // draft to target vocabulary mapping
+
+    // unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
+    std::vector<int32_t> target_layer_ids;
+
    std::vector<llama_layer> layers;

    //Dense linear projections for SentenceTransformers models like embeddinggemma
@@ -0,0 +1,323 @@
+#include "models.h"
+
+// Read the EAGLE3-specific hyperparameters from the GGUF metadata.
+// Throws std::runtime_error when the target layer ids are missing or their count is not 3.
+void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    // ids of the target-model layers to extract (GGUF key "<arch>.target_layers")
+    if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
+        throw std::runtime_error("EAGLE3 model requires 'target_layers' in GGUF metadata");
+    }
+    if (target_layer_ids.size() != 3) {
+        throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'target_layers'");
+    }
+    LLAMA_LOG_INFO("%s: EAGLE3 target_layers = [%d, %d, %d]\n", __func__,
+            target_layer_ids[0],
+            target_layer_ids[1],
+            target_layer_ids[2]);
+
+    uint32_t n_embd_tgt = 0;
+
+    ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
+    LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
+
+    // the input row is one target hidden state per extracted layer, back to back
+    hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
+
+    // eagle3 norm_before_residual (optional, default false)
+    // compatible with Red Hat eagle3 speculator model
+    ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
+    if (hparams.norm_before_residual) {
+        LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
+    }
+
+    type = LLM_TYPE_UNKNOWN;
+}
+
+// Create the EAGLE3 draft tensors: optional d2t vocab mapping, the feature fusion matrix (fc),
+// the output norm / optional output projection, optional token embeddings and the decoder layer(s).
+void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_embd_inp = hparams.n_embd_inp();
+    const int64_t n_embd_attn_input = 2 * n_embd;
+
+    // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
+    // d2t: draft to target vocabulary mapping
+    int64_t n_draft_vocab = n_vocab;  // Default: same as target vocab
+    // look the tensor up through the name table (same pattern as the token_embd lookup below)
+    const struct ggml_tensor * d2t_meta = ml->get_tensor_meta(tn(LLM_TENSOR_D2T).str().c_str());
+    if (d2t_meta) {
+        n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size
+        d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
+        LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
+    } else {
+        d2t = nullptr; // no d2t, use default vocab size
+        LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
+    }
+
+    // Feature fusion layer: projects the concatenated target-layer features to the draft hidden size
+    fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);
+
+    // Output layer (uses draft vocab size); the projection itself is optional
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);
+
+    // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own)
+    // the vocab dimension is taken from the tensor stored in the file
+    const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
+    if (tok_embd_meta) {
+        const int64_t n_target_vocab = tok_embd_meta->ne[1];
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0);
+        LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab);
+    }
+
+    // Decoder layers (the decoder graph asserts n_layer == 1)
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // input_layernorm: applied to token embeddings
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // eagle3 specific: hidden_norm applied to fused target features
+        layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+
+        // Attention takes input_embeds_normed + fused_target_normed as input (hence 2 * n_embd rows)
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        // rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling)
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+// Pick the graph builder for the requested graph type:
+// encoder -> graph<true>, default/decoder -> graph<false>, anything else aborts.
+std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
+    const auto gtype = params.gtype;
+
+    if (gtype == LLM_GRAPH_TYPE_ENCODER) {
+        return std::make_unique<graph<true>>(*this, params);
+    }
+
+    if (gtype == LLM_GRAPH_TYPE_DEFAULT || gtype == LLM_GRAPH_TYPE_DECODER) {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+
+    GGML_ABORT("invalid graph type");
+}
+
+// Create the encoder input: an F32 tensor of hparams.n_embd_inp() values per token that
+// receives the concatenated target-model features, and register it as a graph input.
+template <>
+ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
+    const auto n_embd_inp = hparams.n_embd_inp();
+
+    // Input: Target model features (3 layers concatenated: low, mid, high)
+    auto inp_target = std::make_unique<llm_graph_input_embd>(n_embd_inp);
+
+    ggml_tensor * features = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, n_tokens);
+    ggml_set_input(features);
+    inp_target->embd = features;
+
+    cb(features, "inp_embd", -1);
+
+    // ownership of the input object moves to the graph result; the tensor itself stays valid
+    res->add_input(std::move(inp_target));
+
+    return features;
+}
+
+// eagle3 encoder graph: runs the target-model features through the feature fusion layer
+//   input : target features, e.g. [12288, n_tokens] (layers low, middle, high concatenated)
+//   output: g_embeddings, e.g. [4096, n_tokens], exposed through res->t_h_nextn
+template <>
+llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    ggml_tensor * target_features = build_inp_embd_enc();
+
+    // feature fusion: project the concatenated features with model.fc
+    ggml_tensor * g_embd = build_lora_mm(model.fc, target_features);
+    cb(g_embd, "fc_out", -1);
+
+    // reuse the t_h_nextn slot (same as MTP) so the result can be read via
+    // llama_get_embeddings_nextn(ctx_dft)
+    ggml_set_output(g_embd);
+    res->t_h_nextn = g_embd;
+
+    ggml_build_forward_expand(gf, g_embd);
+}
+
+// eagle3 Decoder: processes draft tokens using g_embeddings from encoder
+// Input: draft tokens + g_embeddings from encoder
+// Output: draft logits in res->t_logits (mapped to the target vocab when model.d2t is present)
+//         and the pre-norm hidden state in res->t_h_nextn
+template <>
+llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_layer == 1);  // eagle3 has only one decoder layer
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // eagle3 Decoder receives:
+    // 1. Token embeddings (e.g. from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
+    // 2. g_embeddings from encoder
+    auto * tok_embd = model.tok_embd;
+    if (model.tok_embd == nullptr) {
+        // no own embedding table: borrow the one of the other (target) context's model
+        GGML_ASSERT(cparams.ctx_other != nullptr);
+        const auto * model_other = llama_get_model(cparams.ctx_other);
+
+        GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
+        tok_embd = model_other->tok_embd;
+    }
+
+    // a single input object carries both the token ids and the g_embeddings
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
+
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(inp->embd);
+
+    ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
+    cb(inp_embd, "inp_embd", -1);
+
+    ggml_tensor * inp_g = inp->embd;
+    cb(inp_g, "inp_g_embeddings", -1);
+
+    res->add_input(std::move(inp));
+
+    inpL = inp_g;
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    // Single decoder layer (il = 0)
+    const int il = 0;
+    {
+        // Apply input_layernorm to the token embeddings
+        ggml_tensor * embd_norm = build_norm(inp_embd,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(embd_norm, "embd_norm", il);
+
+        // Apply hidden_norm to inp_g
+        ggml_tensor * g_norm = build_norm(inp_g,
+                model.layers[il].attn_norm_2, NULL,
+                LLM_NORM_RMS, -1);
+        cb(g_norm, "g_norm", il);
+
+        // norm_before_residual: determines what goes into the residual connection (compatible with Red Hat eagle3 speculator model)
+        // - false (default): use raw inp_g for residual
+        // - true: use normalized g_norm for residual
+        // inpL still holds the raw (un-normalized) inp_g at this point
+        ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
+
+        // Concatenate normalized inp_embd and normalized inp_g along dim 0 (the embedding dimension)
+        // note: the last argument of ggml_concat is the dimension, not a layer index
+        cur = ggml_concat(ctx0, embd_norm, g_norm, 0);
+        cb(cur, "concat_embd", il);
+
+        // Self-attention with concatenated input
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+        // rope freq factors, returns nullptr if not available
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        // RoPE
+        Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+        Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+        cb(Qcur, "Qcur_rope", il);
+        cb(Kcur, "Kcur_rope", il);
+
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, NULL, nullptr,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+
+        // Add residual and update it
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // Apply FFN norm to the sum
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "post_attn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // Add the FFN residual; the sum is the pre-norm hidden state of the layer
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "eagle3_prenorm", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // Output prenorm state (for next token's g_embeddings in autoregressive generation)
+    ggml_set_output(cur);
+    res->t_h_nextn = cur;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    // lm_head - projects to draft vocabulary
+    // if the draft has no own output projection, inherit the target model's lm_head
+    auto * output = model.output;
+    if (output == nullptr) {
+        GGML_ASSERT(cparams.ctx_other != nullptr);
+        const auto * model_other = llama_get_model(cparams.ctx_other);
+
+        GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
+        output = model_other->output;
+    }
+    cur = build_lora_mm(output, cur);
+
+    if (model.d2t) {
+        // expand the draft-vocab logits to the full vocab: start from a tensor filled with -INFINITY
+        // and write each draft logit to the row selected by d2t (via ggml_set_rows)
+        const int64_t n_draft_vocab = cur->ne[0];
+        const int64_t n_outputs     = cur->ne[1];
+        const int64_t n_vocab       = (int64_t) model.vocab.n_tokens();
+
+        GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
+        GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
+
+        ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
+        cur = ggml_set_rows(ctx0, logits,
+                ggml_reshape_3d(ctx0, cur,       1,             n_draft_vocab, n_outputs),
+                ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1,             1));
+        cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
@@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
        const int   n_rot_l      = hparams.n_rot(il);

+        res->t_layer_inp[il] = inpL;
+
        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);
@@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        // norm
@@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base {
    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };

+// Model class for the EAGLE3 architecture.
+// NOTE(review): the graph code in this change refers to a "draft vocabulary" and an
+// "EAGLE3 decoder", so this is presumably the speculative-decoding draft model - confirm.
+struct llama_model_eagle3 : public llama_model_base {
+    llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {}
+    // architecture-specific loading hooks, overriding llama_model_base
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    // Graph builder, specialized at compile time on is_enc.
+    // NOTE(review): is_enc presumably selects the encoder graph vs. the decoder graph - confirm
+    // against the definitions and against build_arch_graph().
+    template <bool is_enc>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+
+        // NOTE(review): by its name this builds the input embedding for the encoder variant;
+        // the definition is not visible here - confirm.
+        ggml_tensor * build_inp_embd_enc() const;
+    };
+
+    // creates the graph context for the given graph parameters
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+

 struct llama_model_mistral4 : public llama_model_deepseek2 {
    llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}
@@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        // norm
@@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
        }

        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
-            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

@@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        // norm
@@ -450,6 +450,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
        if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
            continue; // FIXME: ISWA KV cache initialization needs more fixture params
        }
+        if (arch == LLM_ARCH_EAGLE3) {
+            continue;
+        }
        for (bool moe : {false, true}) {
            if (moe && !moe_implemented(arch)) {
                continue;
@@ -553,6 +556,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
        if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
            continue; // FIXME: ISWA KV cache initialization needs more fixture params
        }
+        if (arch == LLM_ARCH_EAGLE3) {
+            continue;
+        }

        const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
        for (bool moe : {false, true}) {
@@ -169,29 +169,108 @@ bool server_http_context::init(const common_params & params) {
        SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
    }

+    //
+    // Helper: Generate iOS splash screen paths from device dimensions
+    // This centralizes PWA asset paths to avoid duplication across CMake, C++, and TypeScript.
+    // Source of truth: tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES)
+    //
+    auto generate_splash_endpoints = []() -> std::vector<std::string> {
+        // Portrait pixel sizes (width, height) of the supported Apple devices.
+        // Every device yields four files: portrait, landscape (sides swapped) and
+        // the "-dark" variant of each. Keep this list in sync with APPLE_DEVICES
+        // in tools/ui/src/lib/constants/pwa.ts.
+        static const std::vector<std::pair<int, int>> portrait_sizes = {
+            { 640, 1136}, { 750, 1334}, {1170, 2532}, {1179, 2556},
+            {1206, 2622}, {1284, 2778}, {1290, 2796}, {1320, 2868},
+            {1488, 2266}, {1640, 2360}, {1668, 2388}, {2048, 2732},
+        };
+
+        std::vector<std::string> endpoints;
+        endpoints.reserve(portrait_sizes.size() * 4);
+
+        // Emission order: portrait, landscape, portrait-dark, landscape-dark
+        for (const char * scheme : {"", "-dark"}) {
+            for (const bool landscape : {false, true}) {
+                for (const auto & size : portrait_sizes) {
+                    const int width  = landscape ? size.second : size.first;
+                    const int height = landscape ? size.first  : size.second;
+
+                    // e.g. "/apple-splash-landscape-dark-2732x2048.png"
+                    std::string path = "/apple-splash-";
+                    path += landscape ? "landscape" : "portrait";
+                    path += scheme;
+                    path += "-";
+                    path += std::to_string(width);
+                    path += "x";
+                    path += std::to_string(height);
+                    path += ".png";
+                    endpoints.push_back(std::move(path));
+                }
+            }
+        }
+        return endpoints;
+    };
+
    //
    // Middlewares
    //

-    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
-        static const std::unordered_set<std::string> public_endpoints = {
+    // Set of exact request paths that never require an API key: health checks,
+    // model listing, the UI entry point and the PWA assets.
+    // Despite its name this is a set built once (static), not a function.
+    // Hashed files (/_app/..., /workbox-<hash>.js) cannot be listed by exact name;
+    // they are allowed by the prefix check in middleware_validate_api_key.
+    // Source of truth for splash screen paths: tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES)
+    static const std::unordered_set<std::string> get_public_endpoints = [generate_splash_endpoints]() {
+        std::unordered_set<std::string> endpoints {
             "/health",
             "/v1/health",
             "/models",
             "/v1/models",
             "/",
             "/index.html",
-            "/bundle.js",
-            "/bundle.css",
+            // PWA assets
+            "/favicon.ico",
+            "/favicon-dark.ico",
+            "/favicon.svg",
+            "/favicon-dark.svg",
+            "/pwa-64x64.png",
+            "/pwa-192x192.png",
+            "/pwa-512x512.png",
+            "/maskable-icon-512x512.png",
+            "/apple-touch-icon-180x180.png",
+            // PWA runtime files
+            "/manifest.webmanifest",
+            "/sw.js",
+            "/version.json",
+            "/_app/version.json",
+            "/build.json"
         };
+        // iOS splash screens: one entry per generated apple-splash-*.png file
+        const auto splash = generate_splash_endpoints();
+        endpoints.insert(splash.begin(), splash.end());
+        return endpoints;
+    }();

+    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
        // If API key is not set, skip validation
        if (api_keys.empty()) {
            return true;
        }

        // If path is public or static file, skip validation
-        if (public_endpoints.find(req.path) != public_endpoints.end()) {
+        if (get_public_endpoints.find(req.path) != get_public_endpoints.end()) {
+            return true;
+        }
+        // Static assets (_app/ files, workbox runtime). These are embedded at build time
+        // so no API key is needed — browsers fetch them directly.
+        if (req.path.find("/_app/") == 0 || req.path.find("/workbox-") == 0) {
            return true;
        }

@@ -315,7 +394,11 @@ bool server_http_context::init(const common_params & params) {
            }
        } else {
 #if defined(LLAMA_UI_HAS_ASSETS)
-            auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) {
+            // Embedded assets are immutable — cache aggressively for PWA/sw offline support.
+            // PWA runtime files (sw.js, manifest, version.json) use no-cache for revalidation.
+            // Bundle files use Vite content hashes (bundle.<hash>.js/css) so each build
+            // produces a different filename — browsers naturally get a fresh copy on upgrade.
+            auto serve_asset_cached = [](const std::string & name, const char * mime, bool with_isolation_headers) {
                return [name, mime, with_isolation_headers](const httplib::Request & req, httplib::Response & res) {
                    const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
                    if (!a) {
@@ -334,14 +417,129 @@ bool server_http_context::init(const common_params & params) {
                        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
                        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                    }
+                    res.set_header("Cache-Control", "public, max-age=31536000, immutable");
                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
                    return false;
                };
            };

-            srv->Get(params.api_prefix + "/",           serve_asset("index.html", "text/html; charset=utf-8",              true));
-            srv->Get(params.api_prefix + "/bundle.js",  serve_asset("bundle.js",  "application/javascript; charset=utf-8", false));
-            srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8",               false));
+            // Handler factory for embedded assets whose file name is stable across builds.
+            // The response carries "Cache-Control: no-cache", so clients must revalidate
+            // before reusing a stored copy. This handler sets no validator header itself.
+            //   name                   - asset key passed to llama_ui_find_asset()
+            //   mime                   - Content-Type of the response
+            //   with_isolation_headers - also send the COEP/COOP cross-origin isolation headers
+            // Responds with 404 when the asset is not embedded in this build.
+            auto serve_asset_nocache = [](const std::string & name, const char * mime, bool with_isolation_headers) {
+                return [name, mime, with_isolation_headers](const httplib::Request & /*req*/, httplib::Response & res) {
+                    const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
+                    if (!a) {
+                        res.status = 404;
+                        return false;
+                    }
+                    if (with_isolation_headers) {
+                        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+                    }
+                    res.set_header("Cache-Control", "no-cache");
+                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
+                    return false;
+                };
+            };
+
+            // Bundle files in _app/immutable/ — SvelteKit outputs them here and index.html
+            // and sw.js reference them via these paths (vanilla build, no plugin).
+            // The embedded asset key is the request path without the API prefix and
+            // without the leading '/', e.g. "_app/immutable/bundle.<hash>.js".
+            // Responds with 404 for any path that is not a JS or CSS bundle.
+            auto serve_bundle = [serve_asset_cached, api_prefix = params.api_prefix](const httplib::Request & req, httplib::Response & res) {
+                static const std::string js_prefix  = "/_app/immutable/bundle.";
+                static const std::string css_prefix = "/_app/immutable/assets/bundle.";
+
+                // The routes are registered under api_prefix, so strip it before matching;
+                // otherwise every bundle request returns 404 when a prefix is configured.
+                std::string path = req.path;
+                if (!api_prefix.empty() && path.compare(0, api_prefix.size(), api_prefix) == 0) {
+                    path.erase(0, api_prefix.size());
+                }
+
+                // Require at least one character after the prefix (the content hash)
+                const char * mime = nullptr;
+                if (path.size() > js_prefix.size() && path.compare(0, js_prefix.size(), js_prefix) == 0) {
+                    mime = "application/javascript; charset=utf-8";
+                } else if (path.size() > css_prefix.size() && path.compare(0, css_prefix.size(), css_prefix) == 0) {
+                    mime = "text/css; charset=utf-8";
+                } else {
+                    res.status = 404;
+                    return false;
+                }
+                // strip leading / to obtain the asset key
+                return serve_asset_cached(path.substr(1), mime, false)(req, res);
+            };
+
+            // Route registration order matters: the first matching route handles the
+            // request, so the handlers chosen here decide the caching policy.
+            //  - content-hashed files (bundles, workbox runtime): immutable, long-lived cache
+            //  - files with a stable name (index.html, sw.js, manifest, version/build json):
+            //    no-cache, so an upgraded server is picked up instead of a stale copy
+
+            // _app/ paths — vanilla SvelteKit output, index.html and sw.js reference
+            // bundles and version.json here directly.
+            srv->Get(params.api_prefix + R"(/_app/immutable/bundle\.[^/]+\.js)",         serve_bundle);
+            srv->Get(params.api_prefix + R"(/_app/immutable/assets/bundle\.[^/]+\.css)", serve_bundle);
+            srv->Get(params.api_prefix + "/_app/version.json", serve_asset_nocache("_app/version.json", "application/json; charset=utf-8", false));
+
+            // Workbox runtime (workbox-<hash>.js). The asset key is the request path
+            // without the API prefix and without the leading '/'.
+            auto serve_workbox = [serve_asset_cached, api_prefix = params.api_prefix](const httplib::Request & req, httplib::Response & res) {
+                std::string path = req.path;
+                if (!api_prefix.empty() && path.compare(0, api_prefix.size(), api_prefix) == 0) {
+                    path.erase(0, api_prefix.size());
+                }
+                const std::string name = path.empty() ? path : path.substr(1);
+                if (name.rfind("workbox-", 0) == 0 && name.size() > 10) {
+                    return serve_asset_cached(name, "application/javascript; charset=utf-8", false)(req, res);
+                }
+                res.status = 404;
+                return false;
+            };
+            srv->Get(params.api_prefix + R"(/workbox-[^/]+\.js)",   serve_workbox);
+            srv->Get(params.api_prefix + R"(/sw\.js)",              serve_asset_nocache("sw.js",                "application/javascript; charset=utf-8",    false));
+            srv->Get(params.api_prefix + "/manifest.webmanifest",   serve_asset_nocache("manifest.webmanifest", "application/manifest+json; charset=utf-8", false));
+            srv->Get(params.api_prefix + "/version.json",           serve_asset_nocache("_app/version.json",    "application/json; charset=utf-8",          false));
+            srv->Get(params.api_prefix + "/build.json",             serve_asset_nocache("build.json",           "application/json; charset=utf-8",          false));
+
+            // UI entry point. index.html has a stable name and references the hashed
+            // bundles of the current build, so it must be revalidated on every load.
+            srv->Get(params.api_prefix + "/",                               serve_asset_nocache("index.html",                  "text/html; charset=utf-8",                 true));
+            srv->Get(params.api_prefix + "/favicon.ico",                    serve_asset_cached("favicon.ico",                  "image/x-icon",                             false));
+            srv->Get(params.api_prefix + "/favicon-dark.ico",                serve_asset_cached("favicon-dark.ico",              "image/x-icon",                             false));
+            srv->Get(params.api_prefix + "/favicon.svg",                    serve_asset_cached("favicon.svg",                  "image/svg+xml",                            false));
+            srv->Get(params.api_prefix + "/favicon-dark.svg",              serve_asset_cached("favicon-dark.svg",            "image/svg+xml",                            false));
+            srv->Get(params.api_prefix + "/pwa-64x64.png",                  serve_asset_cached("pwa-64x64.png",                "image/png",                                false));
+            srv->Get(params.api_prefix + "/pwa-192x192.png",                serve_asset_cached("pwa-192x192.png",              "image/png",                                false));
+            srv->Get(params.api_prefix + "/pwa-512x512.png",                serve_asset_cached("pwa-512x512.png",              "image/png",                                false));
+            srv->Get(params.api_prefix + "/maskable-icon-512x512.png",      serve_asset_cached("maskable-icon-512x512.png",    "image/png",                                false));
+            srv->Get(params.api_prefix + "/apple-touch-icon-180x180.png",   serve_asset_cached("apple-touch-icon-180x180.png", "image/png",                                false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-640x1136.png",          serve_asset_cached("apple-splash-portrait-640x1136.png",          "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-1136x640.png",         serve_asset_cached("apple-splash-landscape-1136x640.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-750x1334.png",          serve_asset_cached("apple-splash-portrait-750x1334.png",          "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-1334x750.png",         serve_asset_cached("apple-splash-landscape-1334x750.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1170x2532.png",         serve_asset_cached("apple-splash-portrait-1170x2532.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2532x1170.png",        serve_asset_cached("apple-splash-landscape-2532x1170.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1179x2556.png",         serve_asset_cached("apple-splash-portrait-1179x2556.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2556x1179.png",        serve_asset_cached("apple-splash-landscape-2556x1179.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1206x2622.png",         serve_asset_cached("apple-splash-portrait-1206x2622.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2622x1206.png",        serve_asset_cached("apple-splash-landscape-2622x1206.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1284x2778.png",         serve_asset_cached("apple-splash-portrait-1284x2778.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2778x1284.png",        serve_asset_cached("apple-splash-landscape-2778x1284.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1290x2796.png",         serve_asset_cached("apple-splash-portrait-1290x2796.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2796x1290.png",        serve_asset_cached("apple-splash-landscape-2796x1290.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1320x2868.png",         serve_asset_cached("apple-splash-portrait-1320x2868.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2868x1320.png",        serve_asset_cached("apple-splash-landscape-2868x1320.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1488x2266.png",         serve_asset_cached("apple-splash-portrait-1488x2266.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2266x1488.png",        serve_asset_cached("apple-splash-landscape-2266x1488.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1640x2360.png",         serve_asset_cached("apple-splash-portrait-1640x2360.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2360x1640.png",        serve_asset_cached("apple-splash-landscape-2360x1640.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-1668x2388.png",         serve_asset_cached("apple-splash-portrait-1668x2388.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2388x1668.png",        serve_asset_cached("apple-splash-landscape-2388x1668.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-2048x2732.png",         serve_asset_cached("apple-splash-portrait-2048x2732.png",         "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-2732x2048.png",        serve_asset_cached("apple-splash-landscape-2732x2048.png",        "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-640x1136.png",     serve_asset_cached("apple-splash-portrait-dark-640x1136.png",     "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-1136x640.png",    serve_asset_cached("apple-splash-landscape-dark-1136x640.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-750x1334.png",     serve_asset_cached("apple-splash-portrait-dark-750x1334.png",     "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-1334x750.png",    serve_asset_cached("apple-splash-landscape-dark-1334x750.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1170x2532.png",    serve_asset_cached("apple-splash-portrait-dark-1170x2532.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2532x1170.png",   serve_asset_cached("apple-splash-landscape-dark-2532x1170.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1179x2556.png",    serve_asset_cached("apple-splash-portrait-dark-1179x2556.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2556x1179.png",   serve_asset_cached("apple-splash-landscape-dark-2556x1179.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1206x2622.png",    serve_asset_cached("apple-splash-portrait-dark-1206x2622.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2622x1206.png",   serve_asset_cached("apple-splash-landscape-dark-2622x1206.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1284x2778.png",    serve_asset_cached("apple-splash-portrait-dark-1284x2778.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2778x1284.png",   serve_asset_cached("apple-splash-landscape-dark-2778x1284.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1290x2796.png",    serve_asset_cached("apple-splash-portrait-dark-1290x2796.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2796x1290.png",   serve_asset_cached("apple-splash-landscape-dark-2796x1290.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1320x2868.png",    serve_asset_cached("apple-splash-portrait-dark-1320x2868.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2868x1320.png",   serve_asset_cached("apple-splash-landscape-dark-2868x1320.png",   "image/png", false));
+            // 1488x2266 has light routes and is listed as a public endpoint, so it needs dark routes too
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1488x2266.png",    serve_asset_cached("apple-splash-portrait-dark-1488x2266.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2266x1488.png",   serve_asset_cached("apple-splash-landscape-dark-2266x1488.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1640x2360.png",    serve_asset_cached("apple-splash-portrait-dark-1640x2360.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2360x1640.png",   serve_asset_cached("apple-splash-landscape-dark-2360x1640.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1668x2388.png",    serve_asset_cached("apple-splash-portrait-dark-1668x2388.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2388x1668.png",   serve_asset_cached("apple-splash-landscape-dark-2388x1668.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-portrait-dark-2048x2732.png",    serve_asset_cached("apple-splash-portrait-dark-2048x2732.png",    "image/png", false));
+            srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2732x2048.png",   serve_asset_cached("apple-splash-landscape-dark-2732x2048.png",   "image/png", false));
+            srv->Get(params.api_prefix + "/manifest.webmanifest",           serve_asset_nocache("manifest.webmanifest",        "application/manifest+json",                false));
+            srv->Get(params.api_prefix + "/sw.js",                          serve_asset_nocache("sw.js",                       "application/javascript; charset=utf-8",    false));
+            srv->Get(params.api_prefix + "/version.json",                   serve_asset_nocache("version.json",                 "application/json",                         false));
+
 #endif
        }
    }
@@ -26,7 +26,7 @@ def test_access_static_assets_without_api_key():
    """Static web UI assets should not require API key authentication (issue #21229)"""
    global server
    server.start()
-    for path in ["/", "/bundle.js", "/bundle.css"]:
+    for path in ["/", "/sw.js", "/manifest.webmanifest", "/_app/version.json"]:
        res = server.make_request("GET", path)
        assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}"

@@ -8,6 +8,8 @@ node_modules
 .wrangler
 /.svelte-kit
 /build
+dev-dist
+dist

 # OS
 .DS_Store
@@ -23,6 +25,15 @@ Thumbs.db
 vite.config.js.timestamp-*
 vite.config.ts.timestamp-*

+# PWA Artifacts
+apple-splash-*.png
+apple-touch-icon-*.png
+favicon.ico
+favicon-dark.ico
+maskable-icon-*.png
+pwa-*.png
+
+# Storybook
 *storybook.log
 storybook-static
 *.code-workspace
@@ -77,6 +77,7 @@ add_custom_target(llama-ui-assets ALL
        "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
        "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}"
        "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
+        "-DLLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
        "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
        "-DHF_VERSION=${HF_UI_VERSION}"
        "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
@@ -4,8 +4,9 @@
 	"version": "1.0.0",
 	"type": "module",
 	"scripts": {
+		"build": "npm run build-pwa-assets && vite build",
+		"build-pwa-assets": "npx @vite-pwa/assets-generator --root . --config pwa-assets.config.ts && npx @vite-pwa/assets-generator --root . --config pwa-assets-dark.config.ts && node scripts/make-icons-circular.js",
 		"dev": "bash scripts/dev.sh",
-		"build": "vite build",
 		"preview": "vite preview",
 		"prepare": "svelte-kit sync || echo ''",
 		"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
@@ -15,12 +16,15 @@
 		"lint": "prettier --check . && eslint .",
 		"test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
 		"test:e2e": "playwright test",
+		"test:e2e:pwa": "playwright test tests/e2e/pwa.e2e.ts",
 		"test:client": "vitest --project=client",
 		"test:unit": "vitest --project=unit",
+		"test:unit:pwa": "vitest --project=unit --run tests/unit/pwa.spec.ts",
+		"test:pwa": "npm run test:unit:pwa && npm run test:e2e:pwa",
 		"test:ui": "vitest --project=ui",
 		"storybook": "storybook dev -p 6006",
 		"build-storybook": "storybook build",
-		"cleanup": "rm -rf .svelte-kit build node_modules test-results"
+		"cleanup": "rm -rf .svelte-kit build node_modules test-results dist dev-dist debug-storybook.log static/pwa-*.png static/maskable-icon-*.png static/apple-touch-icon-*.png static/apple-splash-*.png static/favicon*.ico"
 	},
 	"devDependencies": {
 		"@chromatic-com/storybook": "5.0.0",
@@ -41,29 +45,31 @@
 		"@tailwindcss/forms": "0.5.10",
 		"@tailwindcss/typography": "0.5.16",
 		"@tailwindcss/vite": "4.1.11",
-		"@types/node": "^24",
+		"@types/node": "24.13.0",
+		"@vite-pwa/assets-generator": "1.0.2",
+		"@vite-pwa/sveltekit": "1.1.0",
 		"@vitest/browser": "4.1.8",
 		"@vitest/browser-playwright": "4.1.8",
 		"@vitest/coverage-v8": "4.1.8",
 		"bits-ui": "2.18.1",
 		"clsx": "2.1.1",
-		"dexie": "4.0.11",
-		"eslint": "9.39.2",
+		"dexie": "4.4.3",
+		"eslint": "9.39.4",
 		"eslint-config-prettier": "10.1.8",
-		"eslint-plugin-storybook": "10.2.4",
-		"eslint-plugin-svelte": "3.15.0",
-		"globals": "16.3.0",
+		"eslint-plugin-storybook": "10.4.2",
+		"eslint-plugin-svelte": "3.19.0",
+		"globals": "16.5.0",
 		"highlight.js": "11.11.1",
 		"http-server": "14.1.1",
 		"mdast": "3.0.0",
-		"mdsvex": "0.12.6",
+		"mdsvex": "0.12.7",
 		"mermaid": "11.15.0",
 		"mode-watcher": "1.1.0",
 		"pdfjs-dist": "5.4.54",
 		"playwright": "1.56.1",
-		"prettier": "3.6.2",
-		"prettier-plugin-svelte": "3.4.0",
-		"prettier-plugin-tailwindcss": "0.6.14",
+		"prettier": "3.8.3",
+		"prettier-plugin-svelte": "4.1.0",
+		"prettier-plugin-tailwindcss": "0.8.0",
 		"rehype-highlight": "7.0.2",
 		"rehype-katex": "7.0.1",
 		"rehype-stringify": "10.0.1",
@@ -73,25 +79,25 @@
 		"remark-html": "16.0.1",
 		"remark-math": "6.0.0",
 		"remark-rehype": "11.1.2",
-		"sass": "1.93.3",
-		"storybook": "10.3.3",
-		"svelte": "5.55.7",
-		"svelte-check": "4.3.0",
-		"svelte-sonner": "1.0.5",
-		"tailwind-merge": "3.3.1",
+		"sass": "1.100.0",
+		"storybook": "10.4.2",
+		"svelte": "5.56.1",
+		"svelte-check": "4.6.0",
+		"svelte-sonner": "1.1.1",
+		"tailwind-merge": "3.6.0",
 		"tailwind-variants": "3.2.2",
-		"tailwindcss": "4.1.11",
-		"tw-animate-css": "1.3.5",
-		"typescript": "5.8.3",
-		"typescript-eslint": "8.56.0",
+		"tailwindcss": "4.3.0",
+		"tw-animate-css": "1.4.0",
+		"typescript": "5.9.3",
+		"typescript-eslint": "8.60.1",
 		"unified": "11.0.5",
-		"unist-util-visit": "5.0.0",
+		"unist-util-visit": "5.1.0",
 		"uuid": "13.0.2",
-		"vite": "7.3.2",
+		"vite": "7.3.5",
 		"vite-plugin-devtools-json": "0.2.1",
 		"vitest": "4.1.8",
 		"vitest-browser-svelte": "2.1.1",
-		"zod": "4.2.1"
+		"workbox-window": "7.4.1"
 	},
 	"overrides": {
 		"cookie": "1.1.1"
@@ -1,11 +1,31 @@
-import { defineConfig } from '@playwright/test';
+import { defineConfig, devices } from '@playwright/test';

 export default defineConfig({
+	testDir: 'tests/e2e',
+	testMatch: ['**/*.e2e.ts'],
+	timeout: 30000,
+	expect: {
+		timeout: 5000
+	},
+	fullyParallel: true,
+	forbidOnly: !!process.env.CI,
+	retries: process.env.CI ? 2 : 0,
+	workers: process.env.CI ? 1 : undefined,
+	reporter: 'line',
+	use: {
+		baseURL: 'http://localhost:8181',
+		trace: 'on-first-retry'
+	},
+	projects: [
+		{
+			name: 'chromium',
+			use: { ...devices['Desktop Chrome'] }
+		}
+	],
 	webServer: {
 		command: 'npm run build && npx http-server ./dist -p 8181',
 		port: 8181,
 		timeout: 120000,
-		reuseExistingServer: false
-	},
-	testDir: 'tests/e2e'
+		reuseExistingServer: !process.env.CI
+	}
 });
@@ -0,0 +1,20 @@
+import { defineConfig } from '@vite-pwa/assets-generator/config';
+
+// Asset-generator config for the dark favicon.
+// Input is static/favicon-dark.svg; the only output requested is a 48px
+// favicon-dark.ico. All other size lists are left empty.
+// NOTE(review): the empty `sizes` arrays presumably stop the generator from
+// emitting the pwa-*/maskable/apple-touch icons a second time - confirm against
+// the @vite-pwa/assets-generator preset documentation.
+export default defineConfig({
+	headLinkOptions: {
+		preset: '2023'
+	},
+	preset: {
+		transparent: {
+			sizes: [],
+			// [size in px, output file name]
+			favicons: [[48, 'favicon-dark.ico']]
+		},
+		maskable: {
+			sizes: []
+		},
+		apple: {
+			sizes: []
+		}
+	},
+	images: ['static/favicon-dark.svg']
+});
@@ -0,0 +1,51 @@
+import {
+	combinePresetAndAppleSplashScreens,
+	defineConfig,
+	minimal2023Preset
+} from '@vite-pwa/assets-generator/config';
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+import { THEME_COLORS, PWA_GENERATOR_DEVICES, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
+import { SplashOrientation } from './src/lib/enums/splash.enums';
+
+export default defineConfig({
+	headLinkOptions: {
+		preset: PWA_ASSET_GENERATOR.LINK_PRESET
+	},
+	preset: combinePresetAndAppleSplashScreens(
+		minimal2023Preset,
+		{
+			padding: PWA_ASSET_GENERATOR.SPLASH_PADDING,
+			resizeOptions: {
+				background: THEME_COLORS.BACKGROUND_LIGHT,
+				fit: PWA_ASSET_GENERATOR.FIT_MODE
+			},
+			darkResizeOptions: {
+				background: THEME_COLORS.BACKGROUND_DARK,
+				fit: PWA_ASSET_GENERATOR.FIT_MODE
+			},
+			darkImageResolver: async (imageName: string) => {
+				if (imageName.endsWith('favicon.svg')) {
+					return readFileSync(resolve('static/favicon-dark.svg'));
+				}
+			},
+			linkMediaOptions: {
+				log: true,
+				addMediaScreen: PWA_ASSET_GENERATOR.ADD_MEDIA_SCREEN,
+				basePath: PWA_ASSET_GENERATOR.BASE_PATH,
+				xhtml: PWA_ASSET_GENERATOR.XHTML
+			},
+			png: {
+				compressionLevel: PWA_ASSET_GENERATOR.PNG_COMPRESSION_LEVEL,
+				quality: PWA_ASSET_GENERATOR.PNG_QUALITY
+			},
+			name: (landscape, size, dark) => {
+				const orientation = landscape ? SplashOrientation.LANDSCAPE : SplashOrientation.PORTRAIT;
+				const darkPrefix = dark ? PWA_ASSET_GENERATOR.DARK_PREFIX : '';
+				return `apple-splash-${orientation}-${darkPrefix}${size.width}x${size.height}.png`;
+			}
+		},
+		PWA_GENERATOR_DEVICES
+	),
+	images: ['static/favicon.svg']
+});
@@ -0,0 +1,137 @@
+#!/usr/bin/env node
+
+/**
+ * Apply circular mask to pwa-*.png icons.
+ * Uses the maskable icon as source (white bg, full logo) to avoid
+ * the small-colormap pwa icons looking bad when cropped to a circle.
+ *
+ * Usage: node scripts/make-icons-circular.js [--padding-pct <0-50>] [--scale-pct <50-100>]
+ *
+ * - padding-pct: percentage of icon size kept as padding around the circle (default: 0)
+ * - scale-pct: scale down the source image before cropping (default: 85)
+ *
+ * maskable-icon and apple-touch-icon are left untouched.
+ */
+
+import sharp from 'sharp';
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+const STATIC_DIR = path.resolve(__dirname, '..', 'static');
+
+const paddingPct = process.argv.reduce((acc, arg, i, args) => {
+	if (arg === '--padding-pct' && args[i + 1]) return parseFloat(args[i + 1]);
+	return acc;
+}, 0);
+
+// Scale down the source image before cropping to circle
+const scalePct = process.argv.reduce((acc, arg, i, args) => {
+	if (arg === '--scale-pct' && args[i + 1]) return parseFloat(args[i + 1]);
+	return acc;
+}, 85); // default 85% - icon fills 85% of the circular area
+
+// Source for circular icons: the maskable icon (white bg, full logo)
+const sourceIcon = 'maskable-icon-512x512.png';
+const targetIcons = ['pwa-64x64.png', 'pwa-192x192.png', 'pwa-512x512.png'];
+
+// maskable-icon and apple-touch-icon stay square
+const untouchedIcons = ['maskable-icon-512x512.png', 'apple-touch-icon-180x180.png'];
+
+async function makeCircle(targetFilename) {
+	const targetPath = path.join(STATIC_DIR, targetFilename);
+	const sourcePath = path.join(STATIC_DIR, sourceIcon);
+
+	if (!fs.existsSync(sourcePath)) {
+		console.log(`⏭️  ${sourceIcon} not found, skipping`);
+		return;
+	}
+	if (!fs.existsSync(targetPath)) {
+		console.log(`⏭️  ${targetFilename} not found, skipping`);
+		return;
+	}
+
+	const metadata = await sharp(targetPath).metadata();
+	const size = Math.max(metadata.width, metadata.height);
+	const radius = Math.floor((size * (1 - paddingPct / 100)) / 2);
+	const center = Math.floor(size / 2);
+
+	// Build circular mask as RGBA buffer: white opaque circle on transparent bg
+	const maskBuf = Buffer.alloc(size * size * 4, 0);
+	for (let y = 0; y < size; y++) {
+		for (let x = 0; x < size; x++) {
+			const dx = x - center;
+			const dy = y - center;
+			const dist = Math.sqrt(dx * dx + dy * dy);
+			if (dist < radius) {
+				const i = (y * size + x) * 4;
+				maskBuf[i] = 255;
+				maskBuf[i + 1] = 255;
+				maskBuf[i + 2] = 255;
+				maskBuf[i + 3] = 255;
+			}
+		}
+	}
+
+	const tmpMask = path.join(STATIC_DIR, '.mask-tmp.png');
+	await sharp(maskBuf, {
+		raw: { width: size, height: size, channels: 4 }
+	})
+		.png()
+		.toFile(tmpMask);
+
+	// Step 1: Scale source relative to circle diameter (not full icon), composite centered onto white canvas of full size
+	const circleDiameter = Math.floor(size * (1 - paddingPct / 100));
+	const scaledSize = Math.floor((circleDiameter * scalePct) / 100);
+	const offset = Math.floor((size - scaledSize) / 2);
+
+	const scaledBuf = await sharp(sourcePath)
+		.resize(scaledSize, scaledSize, {
+			fit: 'cover',
+			background: { r: 255, g: 255, b: 255, alpha: 1 }
+		})
+		.ensureAlpha()
+		.png()
+		.toBuffer();
+
+	// Step 2: Composite scaled image onto white background, then apply circular mask
+	const output = await sharp({
+		create: {
+			width: size,
+			height: size,
+			channels: 4,
+			background: { r: 255, g: 255, b: 255, alpha: 1 }
+		}
+	})
+		.composite([
+			{ input: scaledBuf, top: offset, left: offset },
+			{ input: tmpMask, top: 0, left: 0, blend: 'dest-in' }
+		])
+		.png()
+		.toBuffer();
+
+	fs.writeFileSync(targetPath, output);
+	fs.unlinkSync(tmpMask);
+
+	console.log(
+		`✓ ${targetFilename} → circle from ${sourceIcon}, ${paddingPct}% padding (size=${size}, r=${radius}, scale=${scalePct}%, circleDiameter=${circleDiameter})`
+	);
+}
+
+async function main() {
+	console.log(`Circular mask: ${paddingPct}% padding, ${scalePct}% scale, source=${sourceIcon}\n`);
+	for (const icon of targetIcons) {
+		await makeCircle(icon);
+	}
+
+	console.log('\nUnchanged:');
+	for (const icon of untouchedIcons) {
+		const fp = path.join(STATIC_DIR, icon);
+		console.log(`  ${icon} (${fs.existsSync(fp) ? fs.statSync(fp).size + ' bytes' : 'missing'})`);
+	}
+}
+
+main();
@@ -0,0 +1,42 @@
+import { writeFileSync, existsSync } from 'node:fs';
+import { resolve } from 'path';
+import type { Plugin } from 'vite';
+import { BUILD_CONFIG } from '../src/lib/constants/pwa';
+
+let processed = false;
+
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
+
+/**
+ * Write build.json with the llama.cpp release build number.
+ *
+ * LLAMA_BUILD_NUMBER is passed from CMake -> npm -> vite via env var.
+ * Used for display of the current llama-server release (e.g. "b1234").
+ */
+export function buildInfoPlugin(): Plugin {
+	return {
+		name: 'llamacpp:build-info',
+		apply: 'build',
+		closeBundle() {
+			setTimeout(() => {
+				try {
+					if (processed) return;
+					processed = true;
+
+					const buildNumber = process.env.LLAMA_BUILD_NUMBER;
+					if (!buildNumber) return;
+
+					const outDir = resolve(OUTPUT_DIR);
+					const indexPath = resolve(outDir, 'index.html');
+					if (!existsSync(indexPath)) return;
+
+					const buildJsonPath = resolve(outDir, 'build.json');
+					writeFileSync(buildJsonPath, JSON.stringify({ version: buildNumber }), 'utf-8');
+					console.log(`Created build.json (version: ${buildNumber})`);
+				} catch (error) {
+					console.error('Failed to write build.json:', error);
+				}
+			}, 100);
+		}
+	};
+}
@@ -1,105 +0,0 @@
-import {
-	readFileSync,
-	writeFileSync,
-	existsSync,
-	readdirSync,
-	copyFileSync,
-	rmSync,
-	unlinkSync
-} from 'fs';
-import { resolve } from 'path';
-import type { Plugin } from 'vite';
-
-const GUIDE_FOR_FRONTEND = `
-<!--
-  This is a static build of the frontend.
-  It is automatically generated by the build process.
-  Do not edit this file directly.
-  To make changes, refer to the "Web UI" section in the README.
-->
-`.trim();
-
-const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist';
-
-export function llamaCppBuildPlugin(): Plugin {
-	return {
-		name: 'llamacpp:build',
-		apply: 'build',
-		closeBundle() {
-			setTimeout(() => {
-				try {
-					const outDir = resolve(OUTPUT_DIR);
-					const indexPath = resolve(outDir, 'index.html');
-					if (!existsSync(indexPath)) return;
-
-					let content = readFileSync(indexPath, 'utf-8');
-
-					// Inline favicon as base64 data URL
-					const faviconPath = resolve('static/favicon.svg');
-					if (existsSync(faviconPath)) {
-						const faviconContent = readFileSync(faviconPath, 'utf-8');
-						const faviconBase64 = Buffer.from(faviconContent).toString('base64');
-						const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`;
-						content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`);
-						console.log('✓ Inlined favicon.svg as base64 data URL');
-					}
-
-					content = content.replace(/\r/g, '');
-					content = GUIDE_FOR_FRONTEND + '\n' + content;
-
-					// Keep the Vite hash as a query string so each build busts the browser cache
-					content = content.replace(/\/_app\/immutable\/bundle\.([^".]+)\.js/g, './bundle.js?$1');
-					content = content.replace(
-						/\/_app\/immutable\/assets\/bundle\.([^".]+)\.css/g,
-						'./bundle.css?$1'
-					);
-					content = content.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__');
-
-					writeFileSync(indexPath, content, 'utf-8');
-					console.log('✓ Updated index.html');
-
-					// Copy bundle.*.js -> bundle.js at output root
-					const immutableDir = resolve(outDir, '_app/immutable');
-					const bundleDir = resolve(outDir, '_app/immutable/assets');
-
-					if (existsSync(immutableDir)) {
-						const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/));
-						if (jsFiles.length > 0) {
-							copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js'));
-							// Normalize __sveltekit_<hash> to __sveltekit__ in bundle.js
-							const bundleJsPath = resolve(outDir, 'bundle.js');
-							let bundleJs = readFileSync(bundleJsPath, 'utf-8');
-							bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__');
-							writeFileSync(bundleJsPath, bundleJs, 'utf-8');
-							console.log(`✓ Copied ${jsFiles[0]} -> bundle.js`);
-						}
-					}
-
-					// Copy bundle.*.css -> bundle.css at output root
-					if (existsSync(bundleDir)) {
-						const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/));
-						if (cssFiles.length > 0) {
-							copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css'));
-							console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`);
-						}
-					}
-
-					// Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz
-					const appDir = resolve(outDir, '_app');
-					if (existsSync(appDir)) {
-						rmSync(appDir, { recursive: true, force: true });
-						console.log('✓ Removed _app directory');
-					}
-
-					const faviconOut = resolve(outDir, 'favicon.svg');
-					if (existsSync(faviconOut)) {
-						unlinkSync(faviconOut);
-						console.log('✓ Removed favicon.svg');
-					}
-				} catch (error) {
-					console.error('Failed to process build output:', error);
-				}
-			}, 100);
-		}
-	};
-}
@@ -0,0 +1,61 @@
+import { readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { resolve } from 'path';
+import type { Plugin } from 'vite';
+import { BUILD_CONFIG } from '../src/lib/constants/pwa';
+
+let processed = false;
+
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
+
+/**
+ * Apply literal find/replace pairs to a text file in place.
+ *
+ * Every occurrence of each `from` string is replaced by its `to` string, pairs being
+ * applied in order. The file is written back only when its content actually changed.
+ *
+ * @param path - file to rewrite; a missing file is silently ignored
+ * @param pairs - `[from, to]` literal replacements (no regex semantics)
+ * @returns true when the file existed and was modified, false otherwise
+ */
+function rewrite(path: string, pairs: [string, string][]): boolean {
+	if (!existsSync(path)) {
+		return false;
+	}
+	const text = readFileSync(path, 'utf-8');
+	let out = text;
+	for (const [from, to] of pairs) {
+		out = out.split(from).join(to);
+	}
+	if (out === text) {
+		return false;
+	}
+	writeFileSync(path, out, 'utf-8');
+	return true;
+}
+
+/**
+ * Relativize SvelteKit absolute base refs so the build is relocatable under any subpath.
+ *
+ * SvelteKit bakes root absolute /_app/ paths into the SPA fallback because paths.relative
+ * does not apply to a depth agnostic fallback page. Rewriting to ./_app/ lets a plain
+ * recursive copy of the output into /any/subdir/ resolve assets against the document URL.
+ * Runs after adapter-static writes index.html and the PWA plugin writes sw.js, deferred the
+ * same way as buildInfoPlugin so the emitted files exist.
+ *
+ * The replacements are literal string matches against generated output, so the plugin
+ * reports which files it really changed and warns when nothing matched.
+ */
+export function relativizeBasePlugin(): Plugin {
+	return {
+		name: 'llamacpp:relativize-base',
+		apply: 'build',
+		closeBundle() {
+			setTimeout(() => {
+				try {
+					// closeBundle can fire more than once per build; only process the first call
+					if (processed) return;
+					processed = true;
+
+					const outDir = resolve(OUTPUT_DIR);
+
+					const targets: [string, [string, string][]][] = [
+						// index.html: modulepreload, stylesheet and bootstrap import reference "/_app/
+						['index.html', [['"/_app/', '"./_app/']]],
+						// sw.js: the only absolute entries are the navigate fallback precache key and handler
+						[
+							'sw.js',
+							[
+								['{url:"/"', '{url:"./"'],
+								['createHandlerBoundToURL("/"', 'createHandlerBoundToURL("./"']
+							]
+						]
+					];
+
+					const changed = targets
+						.filter(([file, pairs]) => rewrite(resolve(outDir, file), pairs))
+						.map(([file]) => file);
+
+					if (changed.length > 0) {
+						console.log(`Relativized base refs in ${changed.join(' and ')}`);
+					} else {
+						console.warn(
+							'No base refs relativized: index.html and sw.js were missing or contained no matching absolute refs'
+						);
+					}
+				} catch (error) {
+					console.error('Failed to relativize base refs:', error);
+				}
+			}, 100);
+		}
+	};
+}
@@ -0,0 +1,115 @@
+import { readdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { resolve } from 'path';
+import type { Plugin } from 'vite';
+import { TAB, NEWLINE } from '../src/lib/constants/code';
+import { APPLE_DEVICES, BUILD_CONFIG, REGEX_PATTERNS, SPLASH_LINK } from '../src/lib/constants/pwa';
+import type { SplashDimensions } from '../src/lib/types';
+import { SplashOrientation } from '../src/lib/enums/splash.enums';
+
+let processed = false;
+
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
+
+/**
+ * Generate iOS splash screen <link> tags from generated apple-splash-*.png files.
+ *
+ * Scans `outDir` for file names matching REGEX_PATTERNS.SPLASH_FILE, looks up each
+ * file's WxH in a table built from APPLE_DEVICES and emits one link per recognised
+ * file. Files whose dimensions are not in the table are skipped with a console warning.
+ *
+ * @param outDir - directory to scan (only its direct entries are read)
+ * @returns HTML link strings to be injected into the page head: all light-mode
+ *          links first, followed by all dark-mode links
+ */
+export function generateSplashScreenLinks(outDir: string): string[] {
+	const files = readdirSync(outDir).filter((f) => f.match(REGEX_PATTERNS.SPLASH_FILE));
+	if (files.length === 0) return [];
+
+	// Lookup table from a "WxH" string to the device spec used in the media query.
+	const dimMap = new Map<string, SplashDimensions>();
+	for (const [dims, spec] of Object.entries(APPLE_DEVICES)) {
+		const [w, h] = dims.split('x').map(Number);
+		// dimensions exactly as written in the APPLE_DEVICES key, in both orientations
+		dimMap.set(`${w}x${h}`, { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr });
+		dimMap.set(`${h}x${w}`, { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr });
+		// key scaled by dpr, in both orientations; this is what lets an entry keyed in
+		// logical points match a splash file named by its pixel size
+		dimMap.set(`${w * spec.dpr}x${h * spec.dpr}`, {
+			deviceW: spec.width,
+			deviceH: spec.height,
+			dpr: spec.dpr
+		});
+		dimMap.set(`${h * spec.dpr}x${w * spec.dpr}`, {
+			deviceW: spec.width,
+			deviceH: spec.height,
+			dpr: spec.dpr
+		});
+	}
+
+	const lightLinks: string[] = [];
+	const darkLinks: string[] = [];
+
+	for (const file of files) {
+		// Capture groups: 1 = orientation, 2 = optional "dark-" marker, 3 = width, 4 = height
+		const match = file.match(REGEX_PATTERNS.SPLASH_FILE);
+		if (!match) continue;
+		const orientation = match[1] as SplashOrientation;
+		const isDark = !!match[2];
+		const pixelW = parseInt(match[3]);
+		const pixelH = parseInt(match[4]);
+
+		const key = `${pixelW}x${pixelH}`;
+		const spec = dimMap.get(key);
+		if (!spec) {
+			console.warn(`Unknown splash screen dimensions: ${key} (${file})`);
+			continue;
+		}
+
+		// device-width/device-height always use the spec's width/height as stored,
+		// regardless of the file's orientation; orientation is a separate media feature
+		const { deviceW, deviceH, dpr } = spec;
+		const media = `screen and (device-width: ${deviceW}px) and (device-height: ${deviceH}px) and (-webkit-device-pixel-ratio: ${dpr}) and (orientation: ${orientation})`;
+		const href = `./${file}`;
+
+		if (isDark) {
+			darkLinks.push(
+				`${SPLASH_LINK.HTML} media="${media}${SPLASH_LINK.DARK_MEDIA_SUFFIX}" href="${href}">`
+			);
+		} else {
+			lightLinks.push(`${SPLASH_LINK.HTML} media="${media}" href="${href}">`);
+		}
+	}
+
+	return [...lightLinks, ...darkLinks];
+}
+
+export function splashScreenPlugin(): Plugin {
+	return {
+		name: 'llamacpp:splash-screen',
+		apply: 'build',
+		closeBundle() {
+			setTimeout(() => {
+				try {
+					if (processed) return;
+					processed = true;
+
+					const outDir = resolve(OUTPUT_DIR);
+					const indexPath = resolve(outDir, 'index.html');
+					if (!existsSync(indexPath)) return;
+
+					let content = readFileSync(indexPath, 'utf-8');
+
+					// Inject iOS splash screen <link> tags into <head>.
+					// The @vite-pwa/assets-generator generates apple-splash-*.png files;
+					// this scans them and creates the <link> tags SvelteKit needs.
+					const splashLinks = generateSplashScreenLinks(outDir);
+					if (splashLinks.length > 0) {
+						console.log(`Generated ${splashLinks.length} apple-splash link tags`);
+						const splashHtml = splashLinks.map((l) => TAB + TAB + l).join(NEWLINE);
+						content = content.replace(
+							REGEX_PATTERNS.HEAD_CLOSE,
+							splashHtml + NEWLINE + TAB + TAB + '</head>'
+						);
+					}
+
+					// Remove trailing \r from Windows line endings
+					content = content.replace(/\r/g, '');
+					content = BUILD_CONFIG.GUIDE_COMMENT + NEWLINE + content;
+
+					writeFileSync(indexPath, content, 'utf-8');
+					console.log('Updated index.html');
+				} catch (error) {
+					console.error('Failed to process build output:', error);
+				}
+			}, 100);
+		}
+	};
+}
@@ -1,6 +1,9 @@
 // See https://svelte.dev/docs/kit/types#app.d.ts
 // for information about these interfaces

+import 'vite-plugin-pwa/pwa-assets';
+import 'vite-plugin-pwa/svelte';
+
 // Import chat types from dedicated module

 import type {
@@ -2,10 +2,17 @@
 <html lang="en">
 	<head>
 		<meta charset="utf-8" />
-		<link rel="icon" href="%sveltekit.assets%/favicon.svg" />
+		<link rel="icon" href="favicon.ico" sizes="48x48" />
+		<link rel="icon" href="favicon.svg" sizes="any" type="image/svg+xml" />
+
+		<link rel="apple-touch-icon" href="apple-touch-icon-180x180.png" />
+
+		<link rel="manifest" href="./manifest.webmanifest" />
+
 		<meta name="viewport" content="width=device-width, initial-scale=1" />
 		%sveltekit.head%
 	</head>
+
 	<body data-sveltekit-preload-data="hover">
 		<div style="display: contents">%sveltekit.body%</div>
 	</body>
@@ -20,6 +20,8 @@
 	import { ColorMode } from '$lib/enums/ui.enums';
 	import { fade } from 'svelte/transition';
 	import { goto } from '$app/navigation';
+	import { Button } from '$lib/components/ui/button';
+	import { RefreshCw } from '@lucide/svelte';
 	import { page } from '$app/state';
 	import { setChatSettingsConfigContext } from '$lib/contexts';
 	import { settingsReferrer } from '$lib/stores/settings-referrer.svelte';
@@ -164,6 +166,15 @@
 								onConfigChange={handleConfigChange}
 								onThemeChange={handleThemeChange}
 							/>
+
+							{#if currentSection.title === SETTINGS_SECTION_TITLES.GENERAL}
+								<div class="flex justify-end">
+									<Button variant="outline" onclick={() => window.location.reload()}>
+										<RefreshCw class="h-3 w-3" />
+										Reload app
+									</Button>
+								</div>
+							{/if}
 						</div>
 					{/if}
 				</div>
@@ -0,0 +1,23 @@
+<script lang="ts">
+	import { APPLE_META_TAGS, MEDIA_QUERIES, THEME_COLORS } from '$lib/constants/pwa';
+	import { APP_NAME } from '$lib/constants';
+
+	let { appName = APP_NAME } = $props();
+</script>
+
+<svelte:head>
+	<!-- Theme color for light/dark modes -->
+	<meta name="theme-color" content={THEME_COLORS.LIGHT} media={MEDIA_QUERIES.PREFERS_LIGHT} />
+	<meta name="theme-color" content={THEME_COLORS.DARK} media={MEDIA_QUERIES.PREFERS_DARK} />
+
+	<!-- Apple mobile web app meta tags -->
+	<meta
+		name={APPLE_META_TAGS.MOBILE_WEB_APP_CAPABLE.name}
+		content={APPLE_META_TAGS.MOBILE_WEB_APP_CAPABLE.content}
+	/>
+	<meta
+		name={APPLE_META_TAGS.STATUS_BAR_STYLE.name}
+		content={APPLE_META_TAGS.STATUS_BAR_STYLE.content}
+	/>
+	<meta name={APPLE_META_TAGS.MOBILE_WEB_APP_TITLE.name} content={appName} />
+</svelte:head>
@@ -0,0 +1,35 @@
+<script lang="ts">
+	import * as Card from '$lib/components/ui/card';
+	import { Button } from '$lib/components/ui/button';
+
+	let { needRefresh: needRefreshProp, updateServiceWorker, forceReload } = $props();
+	let needRefresh = $derived(needRefreshProp ?? false);
+</script>
+
+{#if needRefresh}
+	<Card.Root class="overflow-hidden gap-1 py-5">
+		<Card.Header class="px-5">
+			<Card.Title class="text-sm font-medium">Update available</Card.Title>
+		</Card.Header>
+
+		<Card.Content class="gap-6 grid px-5">
+			<p class="text-xs text-muted-foreground">A new version is available. Reload to update.</p>
+
+			<Button
+				class="justify-self-end-safe"
+				size="sm"
+				onclick={() => {
+					updateServiceWorker();
+
+					if (forceReload) {
+						window.location.reload();
+					}
+
+					needRefresh = false;
+				}}
+			>
+				Reload
+			</Button>
+		</Card.Content>
+	</Card.Root>
+{/if}
@@ -0,0 +1,2 @@
+export { default as PwaMetaTags } from './PwaMetaTags.svelte';
+export { default as PwaRefreshAlert } from './PwaRefreshAlert.svelte';
@@ -0,0 +1 @@
+export const APP_NAME = import.meta.env?.VITE_PUBLIC_APP_NAME || 'llama-ui';
@@ -1,4 +1,5 @@
 export const NEWLINE = '\n';
+export const TAB = '\t';
 export const DEFAULT_LANGUAGE = 'text';
 export const LANG_PATTERN = /^(\w*)\n?/;
 export const AMPERSAND_REGEX = /&/g;
@@ -3,6 +3,7 @@

 export * from './agentic';
 export * from './api-endpoints';
+export * from './app';
 export * from './attachment-labels';
 export * from './database';
 export * from './reasoning-effort';
@@ -36,6 +37,7 @@ export * from './message-export';
 export * from './model-id';
 export * from './precision';
 export * from './processing-info';
+export * from './pwa';
 export * from './routes';
 export * from './sandbox';
 export * from './settings-keys';
@@ -0,0 +1,30 @@
+/**
+ * JPEG and EXIF binary format constants for orientation parsing.
+ */
+
+/** Bytes of file prefix to scan, the APP1 EXIF segment sits near the start */
+export const EXIF_SCAN_BYTE_LIMIT = 128 * 1024;
+
+/** JPEG start of image marker */
+export const JPEG_SOI_MARKER = 0xffd8;
+
+/** APP1 segment marker byte, carries the EXIF payload */
+export const APP1_MARKER = 0xe1;
+
+/** Start of scan marker byte, compressed data begins and no EXIF follows */
+export const SOS_MARKER = 0xda;
+
+/** "Exif" signature opening the APP1 payload, big endian uint32 */
+export const EXIF_SIGNATURE = 0x45786966;
+
+/** TIFF byte order mark for little endian ("II") */
+export const TIFF_LITTLE_ENDIAN = 0x4949;
+
+/** TIFF magic number following the byte order mark */
+export const TIFF_MAGIC = 42;
+
+/** EXIF tag id holding the orientation value */
+export const EXIF_ORIENTATION_TAG = 0x0112;
+
+/** Size in bytes of one IFD directory entry */
+export const IFD_ENTRY_SIZE = 12;
@@ -0,0 +1,352 @@
+/**
+ * Centralized PWA constants to avoid magic strings, regexes, and duplicated
+ * definitions across the codebase.
+ */
+
+import { APP_NAME } from './app';
+
+export const MEDIA_QUERIES = {
+	PREFERS_DARK: '(prefers-color-scheme: dark)',
+	PREFERS_LIGHT: '(prefers-color-scheme: light)'
+} as const;
+
+export const THEME_COLORS = {
+	LIGHT: '#ffffff',
+	DARK: '#0d0d0d',
+	ACCENT_BLUE: '#2563eb',
+	ACCENT_BLUE_HOVER: '#1d4ed8',
+	BACKGROUND_LIGHT: 'white',
+	BACKGROUND_DARK: '#111111',
+	TITLE_UPDATE_ALERT: {
+		BORDER_LIGHT: 'zinc-200',
+		BORDER_DARK: 'zinc-700',
+		BG_LIGHT: 'white',
+		BG_DARK: 'zinc-800',
+		TEXT_LIGHT: 'zinc-500',
+		TEXT_DARK: 'zinc-400'
+	}
+} as const;
+
+export const FAVICON_PATHS = {
+	ICO_LIGHT: 'favicon.ico',
+	ICO_DARK: 'favicon-dark.ico',
+	SVG_LIGHT: 'favicon.svg',
+	SVG_DARK: 'favicon-dark.svg'
+} as const;
+
+export const FAVICON_SELECTORS = {
+	ICO_48X48: 'link[rel="icon"][sizes="48x48"]',
+	SVG_ANY: 'link[rel="icon"][type="image/svg+xml"]'
+} as const;
+
+export const APPLE_ASSETS = {
+	TOUCH_ICON: 'apple-touch-icon-180x180.png'
+} as const;
+
+export const PWA_MANIFEST = {
+	name: APP_NAME,
+	short_name: APP_NAME,
+	description: 'Local AI chat interface powered by llama.cpp',
+	start_url: './',
+	display: 'standalone' as const,
+	background_color: THEME_COLORS.BACKGROUND_LIGHT,
+	theme_color: THEME_COLORS.BACKGROUND_LIGHT,
+	icons: [
+		{ src: 'pwa-64x64.png', sizes: '64x64', type: 'image/png' },
+		{ src: 'pwa-192x192.png', sizes: '192x192', type: 'image/png' },
+		{ src: 'pwa-512x512.png', sizes: '512x512', type: 'image/png', purpose: 'any' as const },
+		{
+			src: 'maskable-icon-512x512.png',
+			sizes: '512x512',
+			type: 'image/png',
+			purpose: 'maskable' as const
+		}
+	]
+};
+
+export const PWA_ICON_PATHS = {
+	PWA_64: '/pwa-64x64.png',
+	PWA_192: '/pwa-192x192.png',
+	PWA_512: '/pwa-512x512.png',
+	MASKABLE_512: '/maskable-icon-512x512.png'
+} as const;
+
+/**
+ * Apple device screen specs used to build splash-screen media queries.
+ *
+ * `width`/`height` are logical points (CSS px) and `dpr` is the device pixel ratio.
+ * Most keys are the physical pixel size (points × dpr); the last two iPad entries are
+ * keyed by logical points instead, so a lookup by a splash file's pixel size must also
+ * try key × dpr (generateSplashScreenLinks registers both forms).
+ */
+export const APPLE_DEVICES = {
+	// iPhones (DPR 3)
+	'1170x2532': { width: 390, height: 844, dpr: 3 }, // iPhone 13, 13 Pro, 14, 16e
+	'1179x2556': { width: 393, height: 852, dpr: 3 }, // iPhone 14 Pro, 15, 15 Pro, 16
+	'1206x2622': { width: 402, height: 874, dpr: 3 }, // iPhone 16 Pro
+	'1284x2778': { width: 428, height: 926, dpr: 3 }, // iPhone 13 Pro Max, 14 Plus
+	'1290x2796': { width: 430, height: 932, dpr: 3 }, // iPhone 14 Pro Max, 15 Plus, 15 Pro Max, 16 Plus
+	'1320x2868': { width: 440, height: 956, dpr: 3 }, // iPhone 16 Pro Max
+	// iPhones (DPR 2)
+	'750x1334': { width: 375, height: 667, dpr: 2 }, // iPhone SE 4.7" (6/7/8 size)
+	'640x1136': { width: 320, height: 568, dpr: 2 }, // iPhone SE 4" (5/5s size)
+	// iPads (DPR 2)
+	'1668x2388': { width: 834, height: 1194, dpr: 2 }, // iPad Pro 11"
+	'2048x2732': { width: 1024, height: 1366, dpr: 2 }, // iPad Pro 12.9"
+	'1640x2360': { width: 820, height: 1180, dpr: 2 }, // iPad Air 10.9", iPad Air 11", iPad 11"
+	// The two entries below are keyed by logical points, not pixels
+	'1032x1376': { width: 1032, height: 1376, dpr: 2 }, // iPad Air 13" (2064x2752 px)
+	// width/height must equal the logical size or the media query never matches the device
+	'744x1133': { width: 744, height: 1133, dpr: 2 } // iPad mini 8.3" (1488x2266 px)
+} as const;
+
+export type AppleDeviceKey = keyof typeof APPLE_DEVICES;
+
+export const PWA_FILE_PATHS = {
+	MANIFEST: '/manifest.webmanifest',
+	SERVICE_WORKER: '/sw.js',
+	VERSION: '/version.json',
+	WORKBOX: '/workbox-<hash>.js'
+} as const;
+
+// Used by the server middleware to skip API key validation.
+// Keep in sync with tools/server/server-http.cpp public_endpoints list.
+
+export const PUBLIC_ENDPOINTS = [
+	'/health',
+	'/v1/health',
+	'/models',
+	'/v1/models',
+	'/props',
+	'/metrics',
+	'/',
+	'/index.html',
+
+	'/favicon.ico',
+	'/favicon-dark.ico',
+	'/favicon.svg',
+	'/favicon-dark.svg',
+	'/pwa-64x64.png',
+	'/pwa-192x192.png',
+	'/pwa-512x512.png',
+	'/maskable-icon-512x512.png',
+	'/apple-touch-icon-180x180.png',
+	'/apple-splash-portrait-640x1136.png',
+	'/apple-splash-landscape-640x1136.png',
+	'/apple-splash-portrait-750x1334.png',
+	'/apple-splash-landscape-750x1334.png',
+	'/apple-splash-portrait-1170x2532.png',
+	'/apple-splash-landscape-1170x2532.png',
+	'/apple-splash-portrait-1179x2556.png',
+	'/apple-splash-landscape-1179x2556.png',
+	'/apple-splash-portrait-1206x2622.png',
+	'/apple-splash-landscape-1206x2622.png',
+	'/apple-splash-portrait-1284x2778.png',
+	'/apple-splash-landscape-1284x2778.png',
+	'/apple-splash-portrait-1290x2796.png',
+	'/apple-splash-landscape-1290x2796.png',
+	'/apple-splash-portrait-1320x2868.png',
+	'/apple-splash-landscape-1320x2868.png',
+	'/apple-splash-portrait-1488x2266.png',
+	'/apple-splash-landscape-1488x2266.png',
+	'/apple-splash-portrait-1640x2360.png',
+	'/apple-splash-landscape-1640x2360.png',
+	'/apple-splash-portrait-1668x2388.png',
+	'/apple-splash-landscape-1668x2388.png',
+	'/apple-splash-portrait-2048x2732.png',
+	'/apple-splash-landscape-2048x2732.png',
+	'/apple-splash-portrait-dark-640x1136.png',
+	'/apple-splash-landscape-dark-640x1136.png',
+	'/apple-splash-portrait-dark-750x1334.png',
+	'/apple-splash-landscape-dark-750x1334.png',
+	'/apple-splash-portrait-dark-1170x2532.png',
+	'/apple-splash-landscape-dark-1170x2532.png',
+	'/apple-splash-portrait-dark-1179x2556.png',
+	'/apple-splash-landscape-dark-1179x2556.png',
+	'/apple-splash-portrait-dark-1206x2622.png',
+	'/apple-splash-landscape-dark-1206x2622.png',
+	'/apple-splash-portrait-dark-1284x2778.png',
+	'/apple-splash-landscape-dark-1284x2778.png',
+	'/apple-splash-portrait-dark-1290x2796.png',
+	'/apple-splash-landscape-dark-1290x2796.png',
+	'/apple-splash-portrait-dark-1320x2868.png',
+	'/apple-splash-landscape-dark-1320x2868.png',
+	'/apple-splash-portrait-dark-1488x2266.png',
+	'/apple-splash-landscape-dark-1488x2266.png',
+	'/apple-splash-portrait-dark-1640x2360.png',
+	'/apple-splash-landscape-dark-1640x2360.png',
+	'/apple-splash-portrait-dark-1668x2388.png',
+	'/apple-splash-landscape-dark-1668x2388.png',
+	'/apple-splash-portrait-dark-2048x2732.png',
+	'/apple-splash-landscape-dark-2048x2732.png',
+	'/manifest.webmanifest',
+	'/sw.js',
+	'/version.json',
+	'/workbox-<hash>.js'
+] as const;
+export const BUILD_CONFIG = {
+	OUTPUT_DIR: './dist',
+	GUIDE_COMMENT: `
+<!--
+  This is a static build of the frontend.
+  It is automatically generated by the build process.
+  Do not edit this file directly.
+  To make changes, refer to the "Web UI" section in the README.
+-->
+`.trim()
+} as const;
+
+export const REGEX_PATTERNS = {
+	SPLASH_FILE: /^apple-splash-(portrait|landscape)-(dark-)?(\d+)x(\d+)\.png$/,
+	HEAD_CLOSE: /\t*<\/head>/
+} as const;
+
+// Device names used by @vite-pwa/assets-generator for splash screen generation.
+// Keep in sync with pwa-assets.config.ts.
+export const PWA_GENERATOR_DEVICES = [
+	'iPhone 13',
+	'iPhone 13 Pro',
+	'iPhone 13 Pro Max',
+	'iPhone 14',
+	'iPhone 14 Plus',
+	'iPhone 14 Pro',
+	'iPhone 14 Pro Max',
+	'iPhone 15',
+	'iPhone 15 Plus',
+	'iPhone 15 Pro',
+	'iPhone 15 Pro Max',
+	'iPhone 16',
+	'iPhone 16 Plus',
+	'iPhone 16 Pro',
+	'iPhone 16 Pro Max',
+	'iPhone 16e',
+	'iPhone SE 4"',
+	'iPhone SE 4.7"',
+	'iPad 11"',
+	'iPad Air 10.9"',
+	'iPad Air 11"',
+	'iPad Air 13"',
+	'iPad Pro 11"',
+	'iPad Pro 12.9"',
+	'iPad mini 8.3"'
+] as const;
+
+// PWA assets generator configuration — used by pwa-assets.config.ts
+export const PWA_ASSET_GENERATOR = {
+	LINK_PRESET: '2023',
+	SPLASH_PADDING: 0.75,
+	FIT_MODE: 'contain',
+	ADD_MEDIA_SCREEN: true,
+	BASE_PATH: './',
+	XHTML: false,
+	PNG_COMPRESSION_LEVEL: 9,
+	PNG_QUALITY: 60,
+	DARK_PREFIX: 'dark-'
+} as const;
+
+export const CACHE_SETTINGS = {
+	IMMUTABLE_MAX_AGE_SECONDS: 31536000,
+	API_CACHE_MAX_AGE_SECONDS: 60 * 60 * 24,
+	API_CACHE_MAX_ENTRIES: 50,
+	MAX_FILE_SIZE_BYTES: 10 * 1024 * 1024
+} as const;
+
+export const GLOB_PATTERNS: string[] = [
+	'**/*.{js,css,html,ico,svg,png,webp,woff,woff2,json,webmanifest}'
+];
+
+// loading.html is the model loading page served by llama-server itself.
+// The SvelteKit PWA manifest transform strips the html extension from every
+// precache entry to match clean URLs, but loading.html is a plain static asset
+// with no clean URL, so static servers answer 404 and the SW install fails.
+export const GLOB_IGNORES: string[] = ['**/loading.html'];
+
+export const SW_CONFIG = {
+	CHECK_INTERVAL_MS: 60000,
+	UPDATE_FETCH_OPTIONS: {
+		CACHE: 'no-store',
+		HEADERS: {
+			CACHE: 'no-store',
+			CACHE_CONTROL: 'no-cache'
+		}
+	}
+} as const;
+
+// Runtime caching configuration for Workbox
+export const RUNTIME_CACHING = {
+	HANDLER: 'NetworkFirst',
+	CACHE_NAME: 'api-cache'
+} as const;
+
+// Workbox runtime caching patterns
+// NOTE(review): Workbox tests a RegExp urlPattern against the full request URL
+// (url.href, e.g. "http://host:8080/v1/models"), so these `^\/`-anchored patterns
+// look like they can never match — confirm whether these routes are actually hit
+// before relying on the api-cache.
+export const API_CACHING_PATTERNS = {
+	V1_API: /^\/v1\/.*/,
+	STATIC_API: /^\/(health|props|models|tools|slots|cors-proxy).*/
+} as const;
+
+// SvelteKit PWA plugin options
+export const PWA_KIT_OPTIONS = {
+	NAVIGATE_FALLBACK: './'
+} as const;
+
+export const APPLE_META_TAGS = {
+	MOBILE_WEB_APP_CAPABLE: { name: 'apple-mobile-web-app-capable', content: 'yes' },
+	STATUS_BAR_STYLE: { name: 'apple-mobile-web-app-status-bar-style', content: 'black-translucent' },
+	MOBILE_WEB_APP_TITLE: { name: 'apple-mobile-web-app-title' }
+} as const;
+
+// Splash screen HTML link tag prefix used by generateSplashScreenLinks
+export const SPLASH_LINK = {
+	HTML: '<link rel="apple-touch-startup-image"',
+	DARK_MEDIA_SUFFIX: ' and (prefers-color-scheme: dark)'
+} as const;
+
+// SvelteKit PWA plugin configuration — used by @vite.config.ts
+import type { SvelteKitPWAOptions } from '@vite-pwa/sveltekit';
+
+export const SVELTEKIT_PWA_OPTIONS: SvelteKitPWAOptions = {
+	// Strategy: generateSW - the plugin generates a service worker automatically
+	// using Workbox. For a custom SW, use 'injectManifest' instead.
+	// Manifest configuration
+	manifest: PWA_MANIFEST,
+
+	// Workbox configuration for generateSW strategy
+	workbox: {
+		// Match all static assets in the build output.
+		// Uses '**/' because SvelteKit outputs files under _app/immutable/
+		// subdirectories.
+		globPatterns: GLOB_PATTERNS,
+		globIgnores: GLOB_IGNORES,
+		maximumFileSizeToCacheInBytes: CACHE_SETTINGS.MAX_FILE_SIZE_BYTES,
+
+		// Runtime caching for API calls - use NetworkFirst so APIs are always fresh
+		runtimeCaching: [
+			{
+				urlPattern: API_CACHING_PATTERNS.V1_API,
+				handler: RUNTIME_CACHING.HANDLER,
+				options: {
+					cacheName: RUNTIME_CACHING.CACHE_NAME,
+					expiration: {
+						maxEntries: CACHE_SETTINGS.API_CACHE_MAX_ENTRIES,
+						maxAgeSeconds: CACHE_SETTINGS.API_CACHE_MAX_AGE_SECONDS
+					}
+				}
+			},
+			{
+				urlPattern: API_CACHING_PATTERNS.STATIC_API,
+				handler: RUNTIME_CACHING.HANDLER,
+				options: {
+					cacheName: RUNTIME_CACHING.CACHE_NAME,
+					expiration: {
+						maxEntries: CACHE_SETTINGS.API_CACHE_MAX_ENTRIES,
+						maxAgeSeconds: CACHE_SETTINGS.API_CACHE_MAX_AGE_SECONDS
+					}
+				}
+			}
+		]
+	},
+
+	devOptions: {
+		enabled: true,
+		suppressWarnings: true,
+		// Use PWA_KIT_OPTIONS.NAVIGATE_FALLBACK to match production SW behaviour
+		// (navigateFallback defaults to the configured base path, which is '/' for this SPA).
+		navigateFallback: PWA_KIT_OPTIONS.NAVIGATE_FALLBACK
+	},
+
+	// SvelteKit-specific options
+	kit: {
+		// Include version file for proper cache invalidation
+		includeVersionFile: true
+	}
+};
@@ -31,6 +31,7 @@ export const SETTINGS_KEYS = {
 	SHOW_RAW_MODEL_NAMES: 'showRawModelNames',
 	SHOW_MODEL_QUANTIZATION: 'showModelQuantization',
 	SHOW_MODEL_TAGS: 'showModelTags',
+	SHOW_BUILD_VERSION: 'showBuildVersion',
 	SHOW_SYSTEM_MESSAGE: 'showSystemMessage',
 	// Sampling
 	TEMPERATURE: 'temperature',
@@ -365,6 +365,14 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 					serverKey: SETTINGS_KEYS.ALWAYS_SHOW_AGENTIC_TURNS,
 					paramType: SyncableParameterType.BOOLEAN
 				}
+			},
+			{
+				key: SETTINGS_KEYS.SHOW_BUILD_VERSION,
+				label: 'Show build version information',
+				help: 'Display the current build version in the bottom-right corner of the interface.',
+				defaultValue: false,
+				type: SettingsFieldType.CHECKBOX,
+				section: SETTINGS_SECTION_SLUGS.DISPLAY
 			}
 		]
 	},
@@ -40,6 +40,9 @@ export const DEPRECATED_MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = `${STORAGE_APP_NA
 /** @deprecated Use {@link USER_OVERRIDES_LOCALSTORAGE_KEY} instead */
 export const DEPRECATED_USER_OVERRIDES_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME_DEPRECATED}.userOverrides`;

+/** Build version stored in localStorage for non-PWA update detection */
+export const BUILD_VERSION_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.buildVersion`;
+
 /** Maps new keys to their deprecated fallback keys */
 export const NEW_TO_DEPRECATED_MAP: Record<string, string> = {
 	[ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY]: DEPRECATED_ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY,
@@ -5,7 +5,6 @@ import { ROUTES } from './routes';

 export const FORK_TREE_DEPTH_PADDING = 8;
 export const SYSTEM_MESSAGE_PLACEHOLDER = 'System message';
-export const APP_NAME = import.meta.env.VITE_PUBLIC_APP_NAME || 'llama-ui';

 export const ICON_STRIP_TRANSITION_DURATION = 150;
 export const ICON_STRIP_TRANSITION_DELAY_MULTIPLIER = 50;
@@ -63,3 +63,5 @@ export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol }
 export { KeyboardKey } from './keyboard.enums';

 export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums';
+
+export { SplashOrientation } from './splash.enums';
@@ -0,0 +1,7 @@
+/**
+ * Orientation of an iOS `apple-touch-startup-image` splash screen.
+ *
+ * The member values are the lowercase strings 'portrait' and 'landscape'.
+ */
+export enum SplashOrientation {
+	PORTRAIT = 'portrait',
+	LANDSCAPE = 'landscape'
+}
@@ -0,0 +1,80 @@
+import { browser } from '$app/environment';
+import { useRegisterSW } from 'virtual:pwa-register/svelte';
+import { versionStore } from '$lib/stores/version.svelte';
+import { BUILD_VERSION_LOCALSTORAGE_KEY } from '$lib/constants/storage';
+import { SW_CONFIG } from '$lib/constants/pwa';
+
+/**
+ * Hook for PWA service worker registration, update polling, and build version mismatch detection.
+ *
+ * Combines two concerns that always belong together:
+ * 1. SW registration with periodic polling for updates
+ * 2. localStorage-based version tracking for non-PWA users
+ *
+ * Must be called during component initialisation because it registers `$effect`s.
+ * The polling timer is cleared when the owning component is destroyed.
+ *
+ * @returns An object exposing `needRefresh` (the store returned by `useRegisterSW`),
+ * `updateServiceWorker`, and the reactive getter `needRefreshByStorage`.
+ */
+export function usePwa() {
+	let swCheckInterval: ReturnType<typeof setInterval> | null = null;
+	// Set on teardown so a late onRegisteredSW callback does not start a new timer
+	let disposed = false;
+	let needRefreshByStorage = $state(false);
+
+	const {
+		// offlineReady, // to do - add installation banners for iOS
+		needRefresh: pwaNeedRefresh,
+		updateServiceWorker
+	} = useRegisterSW({
+		onRegisteredSW(swUrl: string, r: ServiceWorkerRegistration | undefined) {
+			// Never keep more than one polling timer alive
+			if (swCheckInterval) {
+				clearInterval(swCheckInterval);
+				swCheckInterval = null;
+			}
+			if (disposed) return;
+
+			swCheckInterval = setInterval(async () => {
+				// Skip the check without a registration, mid-install, or while offline
+				if (!r || r.installing || !navigator?.onLine) return;
+
+				try {
+					// Probe the SW script first so update() is only attempted when it is reachable
+					const resp = await fetch(swUrl, {
+						cache: SW_CONFIG.UPDATE_FETCH_OPTIONS.CACHE,
+						headers: {
+							cache: SW_CONFIG.UPDATE_FETCH_OPTIONS.HEADERS.CACHE,
+							'cache-control': SW_CONFIG.UPDATE_FETCH_OPTIONS.HEADERS.CACHE_CONTROL
+						}
+					});
+					if (resp?.status === 200) {
+						await r.update();
+					}
+				} catch (e) {
+					console.error(e);
+				}
+			}, SW_CONFIG.CHECK_INTERVAL_MS);
+		},
+		onRegisterError(error: unknown) {
+			console.error('[PWA] SW registration error:', error);
+		}
+	});
+
+	// Stop polling when the owning component is destroyed. This effect reads no
+	// reactive state, so its teardown only runs on destroy.
+	$effect(() => {
+		return () => {
+			disposed = true;
+			if (swCheckInterval) {
+				clearInterval(swCheckInterval);
+				swCheckInterval = null;
+			}
+		};
+	});
+
+	// Detect version mismatch via localStorage.
+	// _app/version.json is SvelteKit's native version file for PWA cache invalidation.
+	// This comparison detects server upgrades for non-PWA users.
+	$effect(() => {
+		if (!browser) return;
+
+		const currentVersion = versionStore.value;
+		if (!currentVersion) return;
+
+		try {
+			const storedVersion = localStorage.getItem(BUILD_VERSION_LOCALSTORAGE_KEY);
+			// A first visit (nothing stored yet) is not a mismatch
+			needRefreshByStorage = !!storedVersion && storedVersion !== currentVersion;
+			localStorage.setItem(BUILD_VERSION_LOCALSTORAGE_KEY, currentVersion);
+		} catch {
+			// localStorage access failed: do not prompt for a refresh
+			needRefreshByStorage = false;
+		}
+	});
+
+	return {
+		/** Writable that is true when a PWA service worker update is available */
+		get needRefresh() {
+			return pwaNeedRefresh;
+		},
+		updateServiceWorker,
+		/** Version mismatch detected via localStorage (non-PWA users) */
+		get needRefreshByStorage() {
+			return needRefreshByStorage;
+		}
+	};
+}
@@ -34,7 +34,6 @@ import type {
 import { modelsStore } from '$lib/stores/models.svelte';
 import { settingsStore } from '../stores/settings.svelte';
 import { capImageDataURLSize } from '../utils/cap-img-size';
-import { MEGAPIXELS_TO_PIXELS } from '$lib/constants/image-size';

 function getAudioInputFormat(mimeType: string): AudioInputFormat {
 	const normalizedMimeType = mimeType.trim().toLowerCase();
@@ -961,10 +960,11 @@ export class ChatService {

 		for (const image of imageFiles) {
 			const maxImageResolution = settingsStore.getConfig(SETTINGS_KEYS.MAX_IMAGE_RESOLUTION);
-			let base64Url = image.base64Url;
-			if (maxImageResolution > 1 / MEGAPIXELS_TO_PIXELS) {
-				base64Url = await capImageDataURLSize(image.base64Url, maxImageResolution);
-			}
+
+			// Caps the resolution and bakes the jpeg exif orientation in one pass,
+			// untouched images pass through as is
+			const base64Url = await capImageDataURLSize(image.base64Url, maxImageResolution);
+
 			contentParts.push({
 				type: ContentPartType.IMAGE_URL,
 				image_url: { url: base64Url }
@@ -0,0 +1,42 @@
+/**
+ * buildInfoStore - llama.cpp build information
+ *
+ * Reads the build version from `build.json` — embedded at llama.cpp build time
+ * with the llama.cpp build number (LLAMA_BUILD_NUMBER). Shown in the UI when
+ * `showBuildVersion` is enabled.
+ *
+ * In dev mode (via `npm run dev`, i.e. when `import.meta.env.DEV` is truthy),
+ * the value is set to the literal `'dev'` since the artifact is not produced.
+ */
+
+import { browser } from '$app/environment';
+import { base } from '$app/paths';
+
+let build = $state<string>('');
+
+/**
+ * Populate `build` once: nothing outside the browser, `'dev'` in dev mode,
+ * otherwise the `version` field of `${base}/build.json` (empty string if absent).
+ */
+async function loadBuild() {
+	if (!browser) {
+		return;
+	}
+
+	if (import.meta.env.DEV) {
+		build = 'dev';
+		return;
+	}
+
+	try {
+		const response = await fetch(`${base}/build.json`, { cache: 'no-store' });
+		if (!response.ok) {
+			return;
+		}
+
+		const payload = await response.json();
+		build = payload.version ?? '';
+	} catch {
+		// build.json missing or unreachable - leave as empty string
+	}
+}
+
+// Start loading as soon as the module is evaluated; the promise is not awaited
+loadBuild();
+
+/**
+ * Read-only accessor for the loaded build string.
+ * `value` is the empty string until `loadBuild` has assigned something.
+ */
+export const buildInfoStore = {
+	get value(): string {
+		return build;
+	}
+};
@@ -489,7 +489,7 @@ class MCPStore {
 			if (!rootDomain) return null;

 			const origin = `${url.protocol}//${rootDomain}`;
-			const candidates = ['favicon.ico', 'favicon.svg', 'favicon.png'];
+			const candidates = ['favicon.ico', 'favicon.png'];

 			for (const path of candidates) {
 				const faviconUrl = `${origin}/${path}`;
@@ -0,0 +1,14 @@
+import { browser } from '$app/environment';
+import { MEDIA_QUERIES } from '$lib/constants';
+
+// Single media query list shared by the initial read and the change listener;
+// null outside the browser
+const systemDarkQuery = browser ? window.matchMedia(MEDIA_QUERIES.PREFERS_DARK) : null;
+
+/** Reactive flag tracking whether the system colour scheme is currently dark */
+export const theme = $state({
+	isSystemDark: systemDarkQuery?.matches ?? false
+});
+
+// Keep the flag in sync when the system preference changes
+systemDarkQuery?.addEventListener('change', (event) => {
+	theme.isSystemDark = event.matches;
+});
@@ -0,0 +1,41 @@
+/**
+ * versionStore - Frontend build version
+ *
+ * Reads from SvelteKit's `_app/version.json` — generated by the @vite-pwa/sveltekit
+ * plugin. The version string changes on every build, so comparing it against
+ * localStorage reliably detects server upgrades.
+ *
+ * In dev mode, falls back to `'dev'`.
+ */
+
+import { browser } from '$app/environment';
+import { base } from '$app/paths';
+
+let version = $state<string>('');
+
+/**
+ * Populate `version` once: nothing outside the browser, `'dev'` in dev mode,
+ * otherwise the `version` field of `${base}/_app/version.json` (empty string if absent).
+ */
+async function loadVersion() {
+	if (!browser) {
+		return;
+	}
+
+	if (import.meta.env.DEV) {
+		version = 'dev';
+		return;
+	}
+
+	try {
+		const response = await fetch(`${base}/_app/version.json`, { cache: 'no-store' });
+		if (!response.ok) {
+			return;
+		}
+
+		const payload = await response.json();
+		version = payload.version ?? '';
+	} catch {
+		// _app/version.json missing or unreachable - leave as empty string
+	}
+}
+
+// Start loading as soon as the module is evaluated; the promise is not awaited
+loadVersion();
+
+/**
+ * Read-only accessor for the loaded frontend version string.
+ * `value` is the empty string until `loadVersion` has assigned something.
+ */
+export const versionStore = {
+	get value(): string {
+		return version;
+	}
+};
@@ -165,3 +165,6 @@ export type { ToolEntry, ToolGroup } from './tools';

 // Reasoning
 export type { ReasoningEffortLevel } from './reasoning';
+
+// Splash
+export type { SplashDimensions } from './splash';
@@ -0,0 +1 @@
+export type SplashDimensions = { deviceW: number; deviceH: number; dpr: number };
@@ -1,11 +1,19 @@
 import { MEGAPIXELS_TO_PIXELS } from '$lib/constants/image-size';
 import { BASE64_IMAGE_URI_REGEX } from '$lib/constants/uri-template';
+import { getJpegOrientationFromDataURL, isJpegMimeType } from './jpeg-orientation';
 import { MimeTypeImage } from '$lib/enums';

 /**
 * Converts an Image base64 data URL to another Image data URL with capped dimensions to reduce file size.
+ *
+ * For JPEGs, the EXIF orientation is baked into the pixels in the same canvas
+ * pass: the browser applies the rotation when decoding, so naturalWidth and
+ * naturalHeight already describe the upright image. Backends decoding with
+ * stb_image ignore EXIF (see ggml-org/llama.cpp#20870). Images that need
+ * neither capping nor rotation pass through untouched, so at most one
+ * re-encode ever happens.
 * @param base64UrlImage - The Image base64 data URL to convert
- * @param maxMegapixels - The maximum image size in megapixels for the output Image
+ * @param maxMegapixels - The maximum image size in megapixels for the output Image, 0 disables capping
 * @returns Promise resolving to Image data URL
 */
 export function capImageDataURLSize(
@@ -26,6 +34,10 @@ export function capImageDataURLSize(
 				return reject(new Error(`Unsupported image MIME type: ${mimeType}`));
 			}

+			const orientation = isJpegMimeType(mimeType)
+				? getJpegOrientationFromDataURL(base64UrlImage)
+				: 1;
+
 			const img = new Image();

 			img.onload = () => {
@@ -46,6 +58,10 @@ export function capImageDataURLSize(
 						const scaleFactor = Math.sqrt(maxPixels / totalPixels);
 						canvas.width = Math.floor(targetWidth * scaleFactor);
 						canvas.height = Math.floor(targetHeight * scaleFactor);
+					} else if (orientation > 1) {
+						// No capping needed but the pixels still need the rotation baked in
+						canvas.width = targetWidth;
+						canvas.height = targetHeight;
 					} else {
 						return resolve(base64UrlImage);
 					}
@@ -0,0 +1,146 @@
+import {
+	EXIF_SCAN_BYTE_LIMIT,
+	JPEG_SOI_MARKER,
+	APP1_MARKER,
+	SOS_MARKER,
+	EXIF_SIGNATURE,
+	TIFF_LITTLE_ENDIAN,
+	TIFF_MAGIC,
+	EXIF_ORIENTATION_TAG,
+	IFD_ENTRY_SIZE
+} from '$lib/constants/jpeg-exif';
+import { MimeTypeImage } from '$lib/enums';
+
+/**
+ * Extract the EXIF orientation tag from a JPEG base64 data URL
+ *
+ * Decodes only a bounded prefix of the base64 payload, sized from
+ * EXIF_SCAN_BYTE_LIMIT, and hands the bytes to the segment walker.
+ * @param base64UrlJpeg - The JPEG base64 data URL to inspect
+ * @returns The orientation value (1 to 8), or 1 when absent or unreadable
+ */
+export function getJpegOrientationFromDataURL(base64UrlJpeg: string): number {
+	try {
+		const commaIndex = base64UrlJpeg.indexOf(',');
+
+		// No comma means there is no payload to decode
+		if (commaIndex < 0) {
+			return 1;
+		}
+
+		const payloadStart = commaIndex + 1;
+		const maxChars = Math.ceil(EXIF_SCAN_BYTE_LIMIT / 3) * 4;
+		const prefix = base64UrlJpeg.slice(payloadStart, payloadStart + maxChars);
+
+		// Trim to a multiple of 4 characters so atob accepts the prefix
+		const usableLength = prefix.length - (prefix.length % 4);
+		const decoded = atob(prefix.slice(0, usableLength));
+		const bytes = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
+
+		return findExifOrientation(new DataView(bytes.buffer));
+	} catch {
+		// Any decoding or parsing failure is treated as "no rotation"
+		return 1;
+	}
+}
+
+/**
+ * Walk the JPEG segments of a header buffer looking for the APP1 EXIF block
+ *
+ * APP1 segments that do not open with the "Exif\0\0" signature are skipped
+ * (XMP metadata uses the same marker), so an EXIF block placed after them is
+ * still found.
+ * @param view - DataView over the JPEG header bytes
+ * @returns The orientation value (1 to 8), or 1 when absent or malformed
+ */
+function findExifOrientation(view: DataView): number {
+	if (view.byteLength < 4 || view.getUint16(0) !== JPEG_SOI_MARKER) {
+		return 1;
+	}
+
+	let offset = 2;
+
+	while (offset + 4 <= view.byteLength) {
+		// Every segment must start with the 0xff marker prefix
+		if (view.getUint8(offset) !== 0xff) {
+			return 1;
+		}
+
+		const marker = view.getUint8(offset + 1);
+
+		// Compressed image data starts here: no EXIF past this point
+		if (marker === SOS_MARKER) {
+			return 1;
+		}
+
+		const segmentLength = view.getUint16(offset + 2);
+
+		// The declared length counts its own two bytes, anything smaller is malformed
+		if (segmentLength < 2) {
+			return 1;
+		}
+
+		if (marker === APP1_MARKER) {
+			const payload = offset + 4;
+			const hasExifSignature =
+				payload + 6 <= view.byteLength &&
+				view.getUint32(payload) === EXIF_SIGNATURE &&
+				view.getUint16(payload + 4) === 0;
+
+			// Only the EXIF flavour of APP1 ends the scan, other APP1 segments are skipped
+			if (hasExifSignature) {
+				return parseExifOrientation(view, payload, segmentLength);
+			}
+		}
+
+		// Skip the marker (2 bytes) plus the segment, whose length includes its length field
+		offset += 2 + segmentLength;
+	}
+
+	return 1;
+}
+
+/**
+ * Parse the orientation tag from an APP1 EXIF payload
+ * @param view - DataView over the JPEG header bytes
+ * @param start - Offset of the APP1 payload, right after the segment length
+ * @param segmentLength - Declared APP1 segment length, including its own two length bytes
+ * @returns The orientation value (1 to 8), or 1 when absent or malformed
+ */
+function parseExifOrientation(view: DataView, start: number, segmentLength: number): number {
+	// The declared length counts the two length bytes that sit before `start`,
+	// so the payload itself is two bytes shorter than segmentLength
+	const payloadLength = Math.max(segmentLength - 2, 0);
+	const end = Math.min(start + payloadLength, view.byteLength);
+
+	// The payload opens with the "Exif\0\0" signature
+	if (
+		start + 6 > end ||
+		view.getUint32(start) !== EXIF_SIGNATURE ||
+		view.getUint16(start + 4) !== 0
+	) {
+		return 1;
+	}
+
+	// All TIFF offsets below are relative to this position
+	const tiff = start + 6;
+
+	if (tiff + 8 > end) {
+		return 1;
+	}
+
+	// The byte-order mark selects the endianness of every later multi-byte read
+	const littleEndian = view.getUint16(tiff) === TIFF_LITTLE_ENDIAN;
+
+	if (view.getUint16(tiff + 2, littleEndian) !== TIFF_MAGIC) {
+		return 1;
+	}
+
+	const ifdOffset = view.getUint32(tiff + 4, littleEndian);
+
+	if (tiff + ifdOffset + 2 > end) {
+		return 1;
+	}
+
+	const entryCount = view.getUint16(tiff + ifdOffset, littleEndian);
+
+	// Scan IFD0 entries for the orientation tag
+	for (let i = 0; i < entryCount; i++) {
+		const entry = tiff + ifdOffset + 2 + i * IFD_ENTRY_SIZE;
+
+		if (entry + IFD_ENTRY_SIZE > end) {
+			return 1;
+		}
+
+		if (view.getUint16(entry, littleEndian) === EXIF_ORIENTATION_TAG) {
+			// The value is read from the entry's value field, 8 bytes into the entry
+			const orientation = view.getUint16(entry + 8, littleEndian);
+
+			return orientation >= 1 && orientation <= 8 ? orientation : 1;
+		}
+	}
+
+	return 1;
+}
+
+/**
+ * Tell whether a MIME type string denotes a JPEG image
+ * @param mimeType - The MIME type to check
+ * @returns True when it equals MimeTypeImage.JPEG or MimeTypeImage.JPG
+ */
+export function isJpegMimeType(mimeType: string): boolean {
+	switch (mimeType) {
+		case MimeTypeImage.JPEG:
+		case MimeTypeImage.JPG:
+			return true;
+		default:
+			return false;
+	}
+}
@@ -57,7 +57,7 @@ export async function convertPDFToText(file: File): Promise<string> {

 	try {
 		const buffer = await getFileAsBuffer(file);
-		const pdf = await pdfjs.getDocument(buffer).promise;
+		const pdf = await pdfjs.getDocument({ data: buffer }).promise;
 		const numPages = pdf.numPages;

 		const textContentPromises: Promise<TextContent>[] = [];
@@ -94,7 +94,7 @@ export async function convertPDFToImage(file: File, scale: number = 1.5): Promis

 	try {
 		const buffer = await getFileAsBuffer(file);
-		const doc = await pdfjs.getDocument(buffer).promise;
+		const doc = await pdfjs.getDocument({ data: buffer }).promise;
 		const pages: Promise<string>[] = [];

 		for (let i = 1; i <= doc.numPages; i++) {
@@ -13,6 +13,8 @@
 		DialogConversationTitleUpdate,
 		SidebarNavigation
 	} from '$lib/components/app';
+	import { PwaMetaTags, PwaRefreshAlert } from '$lib/components/pwa';
+	import { pwaAssetsHead } from 'virtual:pwa-assets/head';

 	import { conversationsStore } from '$lib/stores/conversations.svelte';
 	import * as Sidebar from '$lib/components/ui/sidebar/index.js';
@@ -26,10 +28,16 @@
 	import { modelsStore } from '$lib/stores/models.svelte';
 	import { mcpStore } from '$lib/stores/mcp.svelte';
 	import { TOOLTIP_DELAY_DURATION } from '$lib/constants';
+	import { FAVICON_PATHS, FAVICON_SELECTORS } from '$lib/constants/pwa';
 	import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
+	import { usePwa } from '$lib/hooks/use-pwa.svelte';
 	import { useSettingsNavigation } from '$lib/hooks/use-settings-navigation.svelte';
 	import { conversations } from '$lib/stores/conversations.svelte';
 	import { isMobile } from '$lib/stores/viewport.svelte';
+	import { theme } from '$lib/stores/theme.svelte';
+	import { buildInfoStore } from '$lib/stores/build-info.svelte';
+
+	import { SETTINGS_KEYS } from '$lib/constants';

 	let { children } = $props();
 	let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
@@ -46,11 +54,31 @@
 		  }
 		| undefined = $state();

+	let showBuildVersion = $derived(config()[SETTINGS_KEYS.SHOW_BUILD_VERSION] as boolean);
+
 	let titleUpdateDialogOpen = $state(false);
 	let titleUpdateCurrentTitle = $state('');
 	let titleUpdateNewTitle = $state('');
 	let titleUpdateResolve: ((value: boolean) => void) | null = null;
+
 	const panelNav = useSettingsNavigation();
+	// Keep the hook object intact: destructuring needRefreshByStorage reads the getter once and freezes it
+	const pwa = usePwa();
+	const { needRefresh, updateServiceWorker } = pwa;
+
+	// Point the .ico and .svg favicon <link> elements at the variant matching the system colour scheme
+	function updateFavicon() {
+		const useDark = theme.isSystemDark;
+
+		const icoElement = document.querySelector<HTMLLinkElement>(FAVICON_SELECTORS.ICO_48X48);
+		if (icoElement !== null) {
+			icoElement.href = useDark ? FAVICON_PATHS.ICO_DARK : FAVICON_PATHS.ICO_LIGHT;
+		}
+
+		const svgElement = document.querySelector<HTMLLinkElement>(FAVICON_SELECTORS.SVG_ANY);
+		if (svgElement !== null) {
+			svgElement.href = useDark ? FAVICON_PATHS.SVG_DARK : FAVICON_PATHS.SVG_LIGHT;
+		}
+	}

 	function navigateToConversation(direction: -1 | 1) {
 		const allConvs = conversations();
@@ -137,9 +165,16 @@
 	}

 	onMount(() => {
+		updateFavicon();
 		mounted = true;
 	});

+	$effect(() => {
+		void theme.isSystemDark;
+
+		updateFavicon();
+	});
+
 	$effect(() => {
 		if (alwaysShowSidebarOnDesktop && isDesktop) {
 			sidebarOpen = true;
@@ -236,13 +271,36 @@
 </script>

 <svelte:head>
+	{#if pwaAssetsHead.themeColor}
+		<meta name="theme-color" content={pwaAssetsHead.themeColor.content} />
+	{/if}
+
 	{#if config().customCss}
 		<style use:customCss></style>
 	{/if}
+
+	{#each pwaAssetsHead.links as link (link.href)}
+		<link {...link} />
+	{/each}
+
+	<PwaMetaTags />
 </svelte:head>

+<!-- PWA update prompt + version -->
+<div class="fixed right-4 bottom-4 z-[9999] flex flex-col items-end gap-1">
+	{#if showBuildVersion && buildInfoStore.value}
+		<span class="text-[10px] tabular-nums text-muted-foreground">{buildInfoStore.value}</span>
+	{/if}
+	<PwaRefreshAlert
+		needRefresh={$needRefresh || pwa.needRefreshByStorage}
+		forceReload={pwa.needRefreshByStorage}
+		{updateServiceWorker}
+	/>
+</div>
+
 <Tooltip.Provider delayDuration={TOOLTIP_DELAY_DURATION}>
 	<ModeWatcher />
+
 	<Toaster richColors />

 	<DialogConversationTitleUpdate
@@ -254,7 +312,7 @@
 	/>

 	<Sidebar.Provider bind:open={sidebarOpen}>
-		<div class="flex h-dvh w-full">
+		<div class="flex h-screen w-full">
 			<Sidebar.Root variant="floating" class="h-full"
 				><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
 			>
@@ -285,9 +343,9 @@
 				/>
 			{/if}

-			<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden"
-				>{@render children?.()}</Sidebar.Inset
-			>
+			<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
+				{@render children?.()}
+			</Sidebar.Inset>
 		</div>
 	</Sidebar.Provider>
 </Tooltip.Provider>
@@ -0,0 +1,14 @@
+<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_29_291)">
+<path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="#FAFAFA"/>
+<path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="#FAFAFA"/>
+<path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="#FAFAFA"/>
+<path d="M304 488H112L202.462 331.2L304 488Z" fill="#FAFAFA"/>
+<path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="#FAFAFA"/>
+</g>
+<defs>
+<clipPath id="clip0_29_291">
+<rect width="512" height="512" fill="white"/>
+</clipPath>
+</defs>
+</svg>
@@ -1 +1,14 @@
-<svg width="256" xmlns="http://www.w3.org/2000/svg" height="256" id="screenshot-ef94fbb0-dbab-80ed-8006-89429900edbf" viewBox="0 0 256 256" xmlns:xlink="http://www.w3.org/1999/xlink" fill="none" version="1.1"><g id="shape-ef94fbb0-dbab-80ed-8006-89429900edbf" rx="0" ry="0"><g id="shape-ef94fbb0-dbab-80ed-8006-894215755c3a"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-894215755c3a"><rect rx="0" ry="0" x="0" y="0" transform="matrix(1.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000)" width="256" height="256" style="fill: rgb(27, 31, 32); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef3f" rx="0" ry="0"><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef40"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef40"><path d="M171.66500854492188,99.5302505493164L159.79953002929688,120.62468719482422C144.15451049804688,108.58329010009766,120.9504165649414,106.8254165649414,105.3053970336914,119.7457504272461C80.0798110961914,140.57652282714844,81.8376235961914,188.7422637939453,121.1261978149414,189.00587463378906C132.11300659179688,189.00587463378906,141.42965698242188,183.8201141357422,151.44967651367188,180.39234924316406L156.72335815429688,201.3988494873047C147.84591674804688,205.52989196777344,138.79293823242188,209.7487335205078,129.03683471679688,211.06712341308594C40.08835220336914,223.1964569091797,45.18600845336914,94.78400421142578,125.6088638305664,88.10407257080078C142.48434448242188,86.69782257080078,157.33834838867188,91.09247589111328,171.75314331054688,99.5302505493164Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef41"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef41"><path 
d="M110.2272720336914,79.31470489501953C96.6918716430664,83.35785675048828,84.1232681274414,90.8288345336914,74.6305923461914,101.28812408447266C72.8727798461914,80.01782989501953,77.6188735961914,37.03793716430664,101.2621841430664,28.6001033782959C104.7780532836914,27.36964988708496,116.8195571899414,24.293371200561523,116.4679946899414,30.533788681030273C116.1161880493164,36.77426528930664,107.7663345336914,47.49722671508789,105.7450942993164,53.29823684692383C102.2292251586914,63.49386978149414,105.4811782836914,70.52535247802734,110.3154067993164,79.40265655517578Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef42"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef42"><path d="M143.62692260742188,127.65621185302734L143.62692260742188,143.47706604003906L157.68991088867188,143.47706604003906L157.68991088867188,155.7821807861328L143.62692260742188,155.7821807861328L143.62692260742188,170.7240753173828L130.44284057617188,170.7240753173828L130.44284057617188,155.7821807861328L115.5009536743164,155.7821807861328L115.5009536743164,143.47706604003906L129.12448120117188,143.47706604003906L130.44284057617188,142.15867614746094L130.44284057617188,127.65621185302734L143.62692260742188,127.65621185302734Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef43"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef43"><path 
d="M191.96823120117188,127.65621185302734L191.96823120117188,142.15867614746094L193.28683471679688,143.47706604003906L206.91036987304688,143.47706604003906L206.91036987304688,155.7821807861328L191.96823120117188,155.7821807861328L191.96823120117188,170.7240753173828L178.78439331054688,170.7240753173828L178.78439331054688,155.7821807861328L164.72140502929688,155.7821807861328L164.72140502929688,143.47706604003906L178.78439331054688,143.47706604003906L178.78439331054688,127.65621185302734L191.96823120117188,127.65621185302734Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g><g id="shape-ef94fbb0-dbab-80ed-8006-89422363ef44"><g class="fills" id="fills-ef94fbb0-dbab-80ed-8006-89422363ef44"><path d="M153.20748901367188,38.092655181884766C154.96554565429688,40.72946548461914,145.03341674804688,52.06770706176758,143.45114135742188,54.96817398071289C138.88082885742188,63.581790924072266,141.95700073242188,68.50382232666016,145.38473510742188,76.67792510986328C135.45285034179688,75.18372344970703,126.2240982055664,76.41425323486328,116.3798599243164,77.55683135986328C118.5773696899414,58.659732818603516,129.21261596679688,31.1490535736084,153.20748901367188,38.092655181884766Z" class="st0" style="fill: rgb(255, 130, 54); fill-opacity: 1;"/></g></g></g></g></svg>
+<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_29_291)">
+<path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="#111111"/>
+<path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="#111111"/>
+<path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="#111111"/>
+<path d="M304 488H112L202.462 331.2L304 488Z" fill="#111111"/>
+<path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="#111111"/>
+</g>
+<defs>
+<clipPath id="clip0_29_291">
+<rect width="512" height="512" fill="white"/>
+</clipPath>
+</defs>
+</svg>
@@ -29,9 +29,6 @@ const config = {
 		},
 		alias: {
 			$styles: 'src/styles'
-		},
-		version: {
-			name: 'llama-ui'
 		}
 	},

@@ -0,0 +1,121 @@
+import { describe, expect, it } from 'vitest';
+import { capImageDataURLSize } from '$lib/utils/cap-img-size';
+import { getJpegOrientationFromDataURL } from '$lib/utils/jpeg-orientation';
+
+// Real 64x32 jpegs generated with Pillow, quality 90. The upright picture is
+// four solid quadrants: top left red, top right green, bottom left blue,
+// bottom right yellow. For each exif value the stored pixels are inverse
+// transposed so a conforming decoder shows the upright picture, exactly like
+// a rotated smartphone photo.
+const EXIF1 = `data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAEAAAAAAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAAgAEADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD50ooor8MP9UwooooA9uooor4I/wCcwKKKKAPhSiiiv+gM/qgKKKKAP3Vooor/AJdz+lQooooA/9k=`;
+const EXIF3 = `data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAMAAAAAAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAAgAEADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD7Nooor/Mo/XQooooA/Cqiiiv+og/moKKKKAPuuiiiv+fw/lcKKKKAPEaKKK+9P+jMKKKKAP/Z`;
+const EXIF5 = `data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAUAAAAAAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCABAACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD50ooor8MP9UzwKiiiv9xj/HQ99ooor/Dk/wBizwKiiiv9xj/HQ+66KKK/5/D+Vz7qooor+ej/AE9PhWiiiv6FP8wj7qooor+ej/T0/9k=`;
+const EXIF6 = `data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAYAAAAAAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCABAACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDCooor+Tz+Hz7qooor+ej/AE9PhWiiiv6FP8wj7qooor+ej/T0/Keiiiv7CP72PAqKKK/3GP8AHQ99ooor/Dk/2LPAqKKK/wBxj/HQ/9k=`;
+const EXIF8 = `data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAgAAAAAAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCABAACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD896KKK/1TPhz32iiiv8OT/Ys8Cooor/cY/wAdD32iiiv8OT/Ys/Viiiiv49P4JPhWiiiv6FP8wj7qooor+ej/AE9PhWiiiv6FP8wj/9k=`;
+const NOEXIF = `data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAAgAEADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD50ooor8MP9UwooooA9uooor4I/wCcwKKKKAPhSiiiv+gM/qgKKKKAP3Vooor/AJdz+lQooooA/9k=`;
+
+const RED: Rgb = [255, 0, 0];
+const GREEN: Rgb = [0, 200, 0];
+const BLUE: Rgb = [0, 0, 255];
+const YELLOW: Rgb = [255, 220, 0];
+
+// Wide tolerance per channel, jpeg compression shifts solid colors a bit
+const COLOR_TOLERANCE = 70;
+
+// 0.000512 megapixels is 512 pixels, a quarter of the area of the 2048 pixel fixtures
+const QUARTER_AREA_MEGAPIXELS = 0.000512;
+
+type Rgb = [number, number, number];
+
+// Resolves with the <img> element once the browser has loaded the data URL,
+// rejects when the load fails
+function loadImage(dataUrl: string): Promise<HTMLImageElement> {
+	const image = new Image();
+	const settled = new Promise<HTMLImageElement>((resolve, reject) => {
+		image.onload = () => resolve(image);
+		image.onerror = () => reject(new Error('Failed to decode image.'));
+	});
+	// Handlers are attached above, before the load is kicked off
+	image.src = dataUrl;
+	return settled;
+}
+
+// Draws the decoded data URL onto a canvas and reads one pixel at the center of
+// each quadrant, in the order top left, top right, bottom left, bottom right
+async function quadrantColors(dataUrl: string): Promise<Rgb[]> {
+	const picture = await loadImage(dataUrl);
+
+	const canvas = document.createElement('canvas');
+	canvas.width = picture.naturalWidth;
+	canvas.height = picture.naturalHeight;
+
+	const ctx = canvas.getContext('2d')!;
+	ctx.drawImage(picture, 0, 0);
+
+	// Fractional [x, y] positions of the four quadrant centers
+	const centers = [
+		[0.25, 0.25],
+		[0.75, 0.25],
+		[0.25, 0.75],
+		[0.75, 0.75]
+	];
+
+	const sampled: Rgb[] = [];
+	for (const [fx, fy] of centers) {
+		const px = Math.floor(canvas.width * fx);
+		const py = Math.floor(canvas.height * fy);
+		const { data } = ctx.getImageData(px, py, 1, 1);
+		// Keep the color channels only, the alpha byte is dropped
+		sampled.push([data[0], data[1], data[2]]);
+	}
+	return sampled;
+}
+
+// Asserts that each sampled quadrant is within COLOR_TOLERANCE per channel of the
+// upright layout: red, green, blue, yellow
+function expectUpright(colors: Rgb[]) {
+	const targets = [RED, GREEN, BLUE, YELLOW];
+	targets.forEach((target, quadrant) => {
+		target.forEach((expected, channel) => {
+			const delta = Math.abs(colors[quadrant][channel] - expected);
+			expect(delta).toBeLessThan(COLOR_TOLERANCE);
+		});
+	});
+}
+
+describe('capImageDataURLSize orientation and capping', () => {
+	it('passes upright jpegs through untouched when capping is disabled', async () => {
+		expect(await capImageDataURLSize(EXIF1, 0)).toBe(EXIF1);
+		expect(await capImageDataURLSize(NOEXIF, 0)).toBe(NOEXIF);
+	});
+
+	it('passes upright jpegs through untouched when under the cap threshold', async () => {
+		expect(await capImageDataURLSize(EXIF1, 1)).toBe(EXIF1);
+	});
+
+	it.each([
+		['orientation 3', EXIF3],
+		['orientation 5', EXIF5],
+		['orientation 6', EXIF6],
+		['orientation 8', EXIF8]
+	])('bakes %s into upright pixels without capping', async (_label, fixture) => {
+		const result = await capImageDataURLSize(fixture, 0);
+
+		expect(result).not.toBe(fixture);
+
+		const img = await loadImage(result);
+
+		expect(img.naturalWidth).toBe(64);
+		expect(img.naturalHeight).toBe(32);
+		expectUpright(await quadrantColors(result));
+
+		// The re-encoded jpeg carries no orientation tag anymore
+		expect(getJpegOrientationFromDataURL(result)).toBe(1);
+	});
+
+	it('caps and bakes the orientation in a single output', async () => {
+		const result = await capImageDataURLSize(EXIF6, QUARTER_AREA_MEGAPIXELS);
+		const img = await loadImage(result);
+
+		expect(img.naturalWidth).toBe(32);
+		expect(img.naturalHeight).toBe(16);
+		expectUpright(await quadrantColors(result));
+		expect(getJpegOrientationFromDataURL(result)).toBe(1);
+	});
+
+	it('caps upright jpegs without disturbing the picture', async () => {
+		const result = await capImageDataURLSize(EXIF1, QUARTER_AREA_MEGAPIXELS);
+		const img = await loadImage(result);
+
+		expect(img.naturalWidth).toBe(32);
+		expect(img.naturalHeight).toBe(16);
+		expectUpright(await quadrantColors(result));
+	});
+});
@@ -1,7 +0,0 @@
-import { expect, test } from '@playwright/test';
-
-test('home page loads correctly', async ({ page }) => {
-	await page.goto('/');
-	// Wait for the greeting to become visible (stores need time to initialize)
-	await expect(page.locator('h1', { hasText: /Hello there/ })).toBeVisible();
-});
@@ -0,0 +1,106 @@
+import { expect, test } from '@playwright/test';
+
+test.describe('PWA Service Worker', () => {
+	test('service worker is registered', async ({ page }) => {
+		await page.goto('/');
+
+		const swURL = await page.evaluate(async () => {
+			const registration = await Promise.race([
+				// eslint-disable-next-line @typescript-eslint/ban-ts-comment
+				// @ts-ignore - type inference differs from browser runtime
+				navigator.serviceWorker.ready,
+				new Promise((_, reject) =>
+					setTimeout(() => reject(new Error('Service worker registration failed: timeout')), 15000)
+				)
+			]);
+			// @ts-expect-error registration is of type unknown
+			return registration.active?.scriptURL;
+		});
+
+		expect(swURL).toBeTruthy();
+		expect(swURL).toContain('/sw.js');
+	});
+
+	test('service worker has precache configured', async ({ page }) => {
+		await page.goto('/');
+
+		// `serviceWorker.ready` resolves once the registration has an active worker,
+		// so a single evaluation both waits for activation and yields the script URL
+		const swActive = await page.evaluate(async () => {
+			const reg = await navigator.serviceWorker.ready;
+			return reg.active?.scriptURL ?? null;
+		});
+
+		expect(swActive).toBeTruthy();
+
+		const swResponse = await page.request.get(swActive!);
+		// Fail on the HTTP status rather than on a confusing regex mismatch against an error page
+		expect(swResponse.ok()).toBeTruthy();
+		const swContent = await swResponse.text();
+
+		// Precache contains SvelteKit content-hashed bundle paths.
+		// The hash character class includes `_` to match the build-output unit tests;
+		// without it a hash containing an underscore fails this test intermittently.
+		expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		expect(swContent).toMatch(/"manifest\.webmanifest"/);
+		expect(swContent).toMatch(/"_app\/version\.json"/);
+		expect(swContent).toMatch(/NavigationRoute/);
+		expect(swContent).toMatch(/api-cache/);
+	});
+
+	test('offline mode - page loads when offline after caching', async ({ browser }) => {
+		const context = await browser.newContext();
+
+		// Close the context even when a navigation or assertion throws, so a failure
+		// here does not leak a browser context into the rest of the run
+		try {
+			const offlinePage = await context.newPage();
+
+			await offlinePage.goto('/');
+			await offlinePage.waitForLoadState('networkidle');
+
+			await offlinePage.evaluate(async () => {
+				await navigator.serviceWorker.ready;
+			});
+
+			// NOTE(review): presumably gives the worker time to finish precaching before
+			// the network is cut - confirm, and prefer a deterministic wait if one exists
+			await offlinePage.waitForTimeout(2000);
+
+			await context.setOffline(true);
+			await offlinePage.goto('/');
+
+			const bodyText = await offlinePage.locator('body').textContent();
+			expect(bodyText).toBeTruthy();
+		} finally {
+			await context.close();
+		}
+	});
+
+	test('version.json is accessible and contains version', async ({ page }) => {
+		const versionResponse = await page.request.get('/_app/version.json');
+		expect(versionResponse.ok()).toBeTruthy();
+
+		const versionData = await versionResponse.json();
+		expect(versionData).toHaveProperty('version');
+		expect(typeof versionData.version).toBe('string');
+		expect(versionData.version.length).toBeGreaterThan(0);
+	});
+
+	test('manifest.webmanifest is accessible and valid', async ({ page }) => {
+		const response = await page.request.get('/manifest.webmanifest');
+		expect(response.ok()).toBeTruthy();
+
+		const manifest = await response.json();
+		expect(manifest).toHaveProperty('name', 'llama-ui');
+		expect(manifest).toHaveProperty('short_name', 'llama-ui');
+		expect(manifest).toHaveProperty('start_url', './');
+		expect(manifest).toHaveProperty('display', 'standalone');
+		expect(manifest.icons).toBeTruthy();
+		expect(manifest.icons.length).toBeGreaterThan(0);
+	});
+
+	test('index.html contains content-hashed bundle references', async ({ page }) => {
+		const response = await page.request.get('/');
+		expect(response.ok()).toBeTruthy();
+
+		const html = await response.text();
+
+		// SvelteKit outputs content-hashed bundle names in _app/immutable/.
+		// The hash character class includes `_` to match the build-output unit tests;
+		// without it a hash containing an underscore fails this test intermittently.
+		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/);
+	});
+});
@@ -0,0 +1,57 @@
+<script module lang="ts">
+	import { defineMeta } from '@storybook/addon-svelte-csf';
+	import PwaRefreshAlert from '$lib/components/pwa/PwaRefreshAlert.svelte';
+	import { expect } from 'storybook/test';
+
+	const { Story } = defineMeta({
+		title: 'Components/PwaRefreshAlert',
+		component: PwaRefreshAlert,
+		parameters: {
+			layout: 'centered'
+		}
+	});
+</script>
+
+<Story
+	name="Default"
+	args={{ needRefresh: true, updateServiceWorker: () => console.log('reload') }}
+	play={async ({ canvas }) => {
+		const title = canvas.getByText('Update available');
+		await expect(title).toBeInTheDocument();
+
+		const description = canvas.getByText(/A new version is available/);
+		await expect(description).toBeInTheDocument();
+
+		const button = canvas.getByRole('button', { name: 'Reload' });
+		await expect(button).toBeInTheDocument();
+	}}
+/>
+
+<Story
+	name="Hidden"
+	args={{ needRefresh: false, updateServiceWorker: () => console.log('reload') }}
+	play={async ({ canvas }) => {
+		const title = canvas.queryByText('Update available');
+		await expect(title).not.toBeInTheDocument();
+	}}
+/>
+
+<Story
+	name="ClickReload"
+	args={{
+		needRefresh: true,
+		updateServiceWorker: () => console.log('reload')
+	}}
+	play={async ({ canvas, userEvent }) => {
+		const button = canvas.getByRole('button', { name: 'Reload' });
+		await expect(button).toBeInTheDocument();
+
+		await userEvent.click(button);
+
+		const title = canvas.queryByText('Update available');
+		await expect(title).not.toBeInTheDocument();
+
+		const reloadBtn = canvas.queryByRole('button', { name: 'Reload' });
+		await expect(reloadBtn).not.toBeInTheDocument();
+	}}
+/>
@@ -0,0 +1,100 @@
+import { describe, expect, it } from 'vitest';
+import { getJpegOrientationFromDataURL, isJpegMimeType } from '$lib/utils/jpeg-orientation';
+
+// Builds the TIFF payload of an APP1 segment holding a single IFD0 entry
+function buildTiff(littleEndian: boolean, tag: number, value: number): number[] {
+	// Both encoders lay bytes out most significant first, then flip for little endian
+	const inByteOrder = (msbFirst: number[]): number[] =>
+		littleEndian ? msbFirst.reverse() : msbFirst;
+	const u16 = (v: number) => inByteOrder([v >> 8, v & 0xff]);
+	const u32 = (v: number) =>
+		inByteOrder([(v >> 24) & 0xff, (v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff]);
+
+	// "II" marks little endian, "MM" big endian
+	const byteOrderMark = littleEndian ? [0x49, 0x49] : [0x4d, 0x4d];
+
+	const header = [...byteOrderMark, ...u16(42), ...u32(8)];
+	const entry = [
+		...u16(tag),
+		...u16(3),
+		...u32(1),
+		// SHORT value sits left justified in the 4 byte value field
+		...u16(value),
+		...u16(0)
+	];
+
+	// Header, entry count of one, the entry itself, then a zero terminator word
+	return [...header, ...u16(1), ...entry, ...u32(0)];
+}
+
+// Wraps a TIFF payload into a complete minimal JPEG data URL
+function buildJpegDataURL(tiff: number[] | null, prependApp0 = false): string {
+	const startOfImage = [0xff, 0xd8];
+
+	// JFIF APP0 segment, irrelevant content the parser walks over
+	const app0 = prependApp0 ? [0xff, 0xe0, 0x00, 0x07, 0x4a, 0x46, 0x49, 0x46, 0x00] : [];
+
+	let app1: number[] = [];
+	if (tiff) {
+		// "Exif\0\0" identifier followed by the TIFF block
+		const exifBody = [0x45, 0x78, 0x69, 0x66, 0x00, 0x00, ...tiff];
+		// The segment length counts its own two length bytes
+		const segmentLength = exifBody.length + 2;
+		app1 = [0xff, 0xe1, segmentLength >> 8, segmentLength & 0xff, ...exifBody];
+	}
+
+	// SOS marker terminates the metadata scan
+	const startOfScan = [0xff, 0xda, 0x00, 0x02];
+
+	const binary = [...startOfImage, ...app0, ...app1, ...startOfScan]
+		.map((byte) => String.fromCharCode(byte))
+		.join('');
+
+	return `data:image/jpeg;base64,${btoa(binary)}`;
+}
+
+describe('getJpegOrientationFromDataURL', () => {
+	it('returns the orientation from a little endian EXIF block', () => {
+		expect(getJpegOrientationFromDataURL(buildJpegDataURL(buildTiff(true, 0x0112, 6)))).toBe(6);
+	});
+
+	it('returns the orientation from a big endian EXIF block', () => {
+		expect(getJpegOrientationFromDataURL(buildJpegDataURL(buildTiff(false, 0x0112, 8)))).toBe(8);
+	});
+
+	it('walks over a leading APP0 segment', () => {
+		expect(getJpegOrientationFromDataURL(buildJpegDataURL(buildTiff(true, 0x0112, 3), true))).toBe(
+			3
+		);
+	});
+
+	it('returns 1 when the EXIF block holds no orientation tag', () => {
+		expect(getJpegOrientationFromDataURL(buildJpegDataURL(buildTiff(true, 0x0100, 6)))).toBe(1);
+	});
+
+	it('returns 1 when the orientation value is out of range', () => {
+		expect(getJpegOrientationFromDataURL(buildJpegDataURL(buildTiff(true, 0x0112, 9)))).toBe(1);
+	});
+
+	it('returns 1 when the JPEG has no APP1 segment', () => {
+		expect(getJpegOrientationFromDataURL(buildJpegDataURL(null, true))).toBe(1);
+	});
+
+	it('returns 1 for a payload that is not a JPEG', () => {
+		const png = btoa(String.fromCharCode(0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a));
+		expect(getJpegOrientationFromDataURL(`data:image/png;base64,${png}`)).toBe(1);
+	});
+
+	it('returns 1 for a truncated payload', () => {
+		const truncated = btoa(String.fromCharCode(0xff, 0xd8, 0xff));
+		expect(getJpegOrientationFromDataURL(`data:image/jpeg;base64,${truncated}`)).toBe(1);
+	});
+
+	it('returns 1 for a malformed data URL', () => {
+		expect(getJpegOrientationFromDataURL('not a data url')).toBe(1);
+	});
+});
+
+describe('isJpegMimeType', () => {
+	it('matches both JPEG MIME variants', () => {
+		expect(isJpegMimeType('image/jpeg')).toBe(true);
+		expect(isJpegMimeType('image/jpg')).toBe(true);
+	});
+
+	it('rejects other image MIME types', () => {
+		expect(isJpegMimeType('image/png')).toBe(false);
+		expect(isJpegMimeType('image/webp')).toBe(false);
+	});
+});
@@ -0,0 +1,195 @@
+import { existsSync, readFileSync, readdirSync } from 'node:fs';
+import { resolve } from 'node:path';
+import { describe, expect, it } from 'vitest';
+
+const DIST_DIR = resolve(__dirname, '../../dist');
+const distExists = existsSync(DIST_DIR);
+
+// PWA Build Output tests are integration tests that require a built dist/.
+// CI builds first then runs these tests; local devs should run `npm run build` or use `npm run test:pwa`.
+describe('PWA Build Output', () => {
+	if (!distExists) {
+		console.warn(`⚠ Skipping PWA Build Output tests - dist/ not found (run 'npm run build' first)`);
+		it('skipped - dist/ not found', () => {});
+		return;
+	}
+
+	// Read defensively: this runs while the suite is being collected, so an unguarded
+	// readFileSync on a missing file would abort every test with ENOENT. A missing file
+	// yields '' instead, and the tests below fail on their own existence and
+	// toBeTruthy assertions.
+	const readDistFile = (name: string): string => {
+		const filePath = resolve(DIST_DIR, name);
+		return existsSync(filePath) ? readFileSync(filePath, 'utf-8') : '';
+	};
+	const swContent = readDistFile('sw.js');
+	const indexContent = readDistFile('index.html');
+
+	describe('Core files exist', () => {
+		it('service worker (sw.js) exists', () => {
+			expect(existsSync(resolve(DIST_DIR, 'sw.js')), 'sw.js not found').toBeTruthy();
+		});
+
+		it('workbox library exists (hashed filename)', () => {
+			// SvelteKit generates workbox-{hash}.js files
+			const files = readdirSync(DIST_DIR).filter((f) => f.match(/^workbox-[^.]+\.js$/));
+			expect(files.length).toBeGreaterThan(0);
+		});
+
+		it('manifest.webmanifest exists', () => {
+			expect(
+				existsSync(resolve(DIST_DIR, 'manifest.webmanifest')),
+				'manifest.webmanifest not found'
+			).toBeTruthy();
+		});
+
+		it('SvelteKit bundle.js exists in _app/immutable/', () => {
+			// SvelteKit generates hashed bundle names in _app/immutable/
+			const appDir = resolve(DIST_DIR, '_app', 'immutable');
+			expect(existsSync(appDir), '_app/immutable/ not found').toBeTruthy();
+			const files = readdirSync(appDir).filter((f) => f.startsWith('bundle.') && f.endsWith('.js'));
+			expect(files.length).toBeGreaterThan(0);
+		});
+
+		it('SvelteKit bundle.css exists in _app/immutable/assets/', () => {
+			// SvelteKit generates hashed CSS bundles in _app/immutable/assets/
+			const cssDir = resolve(DIST_DIR, '_app', 'immutable', 'assets');
+			expect(existsSync(cssDir), '_app/immutable/assets/ not found').toBeTruthy();
+			const files = readdirSync(cssDir).filter(
+				(f) => f.startsWith('bundle.') && f.endsWith('.css')
+			);
+			expect(files.length).toBeGreaterThan(0);
+		});
+
+		it('version.json exists in _app/', () => {
+			// SvelteKit stores version.json in _app directory
+			expect(
+				existsSync(resolve(DIST_DIR, '_app', 'version.json')),
+				'_app/version.json not found'
+			).toBeTruthy();
+		});
+	});
+
+	describe('version.json content', () => {
+		it('has valid JSON with version field', () => {
+			const content = readFileSync(resolve(DIST_DIR, '_app', 'version.json'), 'utf-8');
+			const parsed = JSON.parse(content);
+			expect(parsed).toHaveProperty('version');
+			expect(typeof parsed.version).toBe('string');
+			expect(parsed.version.length).toBeGreaterThan(0);
+		});
+	});
+
+	describe('Service worker content', () => {
+		it('service worker has minified self.define format', () => {
+			expect(swContent).toBeTruthy();
+			// SvelteKit's workbox-plugin-sveltekit produces a minified SW with self.define.
+			// The dot is escaped so only the literal `self.define` guard matches.
+			expect(swContent).toMatch(/if\(!self\.define\)/);
+		});
+
+		it('references hashed workbox file (SvelteKit build output)', () => {
+			expect(swContent).toBeTruthy();
+			// SvelteKit's workbox-plugin-sveltekit references hashed workbox files
+			expect(swContent).toMatch(/define\(\["\.\/workbox-[a-zA-Z0-9]+"\]/);
+		});
+
+		it('precache contains SvelteKit bundle.js with content hash', () => {
+			expect(swContent).toBeTruthy();
+			// SvelteKit uses content-hashed bundle names in _app/immutable/
+			expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		});
+
+		it('precache contains SvelteKit bundle.css with content hash', () => {
+			expect(swContent).toBeTruthy();
+			// SvelteKit uses content-hashed CSS bundle names in _app/immutable/assets/
+			expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		});
+
+		it('precache contains _app/version.json', () => {
+			expect(swContent).toBeTruthy();
+			// SvelteKit stores version.json in _app directory
+			expect(swContent).toMatch(/"_app\/version\.json"/);
+		});
+
+		it('precache contains manifest.webmanifest', () => {
+			expect(swContent).toBeTruthy();
+			expect(swContent).toMatch(/"manifest\.webmanifest"/);
+		});
+
+		it('has navigation route registered', () => {
+			expect(swContent).toBeTruthy();
+			expect(swContent).toMatch(/NavigationRoute/);
+		});
+
+		it('has runtime caching for API routes', () => {
+			expect(swContent).toBeTruthy();
+			expect(swContent).toMatch(/api-cache/);
+			expect(swContent).toMatch(/NetworkFirst/);
+		});
+	});
+
+	describe('index.html content', () => {
+		it('has modulepreload link for SvelteKit bundle with content hash', () => {
+			expect(indexContent).toBeTruthy();
+			// SvelteKit generates hashed bundle names in _app/immutable/
+			expect(indexContent).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		});
+
+		it('has stylesheet link for SvelteKit bundle.css with content hash', () => {
+			expect(indexContent).toBeTruthy();
+			expect(indexContent).toMatch(
+				/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/
+			);
+		});
+
+		it('has dynamic import for SvelteKit bundle with content hash', () => {
+			expect(indexContent).toBeTruthy();
+			expect(indexContent).toMatch(
+				/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/
+			);
+		});
+
+		it('has __sveltekit__ variable (SvelteKit adds hash suffix)', () => {
+			expect(indexContent).toBeTruthy();
+			// SvelteKit 2.x uses __sveltekit__ as base with random suffix
+			expect(indexContent).toMatch(/__sveltekit_[a-zA-Z0-9-]+/);
+		});
+
+		it('has PWA manifest link', () => {
+			expect(indexContent).toBeTruthy();
+			expect(indexContent).toMatch(/rel="manifest" href="(\.?\/)?manifest\.webmanifest"/);
+		});
+
+		it('has apple-touch-icon link', () => {
+			expect(indexContent).toBeTruthy();
+			expect(indexContent).toMatch(/rel="apple-touch-icon"/);
+		});
+
+		it('has _app paths for SvelteKit bundles', () => {
+			expect(indexContent).toBeTruthy();
+			// SvelteKit uses _app paths for hashed assets
+			expect(indexContent).toMatch(/_app\//);
+		});
+	});
+
+	describe('SvelteKit _app directory', () => {
+		it('_app directory exists (SvelteKit uses it for hashed assets)', () => {
+			expect(existsSync(resolve(DIST_DIR, '_app'))).toBeTruthy();
+		});
+	});
+
+	describe('Hashed workbox files', () => {
+		it('workbox-*.js files exist in dist root (SvelteKit build output)', () => {
+			const files = readdirSync(DIST_DIR).filter((f) => f.match(/^workbox-[^.]+\.js$/));
+			expect(files.length).toBeGreaterThan(0);
+		});
+	});
+
+	describe('Static assets', () => {
+		it('has favicon.ico', () => {
+			expect(existsSync(resolve(DIST_DIR, 'favicon.ico'))).toBeTruthy();
+		});
+
+		it('has PWA icons', () => {
+			expect(existsSync(resolve(DIST_DIR, 'pwa-64x64.png'))).toBeTruthy();
+			expect(existsSync(resolve(DIST_DIR, 'pwa-192x192.png'))).toBeTruthy();
+			expect(existsSync(resolve(DIST_DIR, 'pwa-512x512.png'))).toBeTruthy();
+		});
+
+		it('has loading.html fallback page', () => {
+			expect(existsSync(resolve(DIST_DIR, 'loading.html'))).toBeTruthy();
+		});
+	});
+});
@@ -1,13 +1,16 @@
 import tailwindcss from '@tailwindcss/vite';
 import { sveltekit } from '@sveltejs/kit/vite';
+import { SvelteKitPWA } from '@vite-pwa/sveltekit';
 import { dirname, resolve } from 'path';
 import { fileURLToPath } from 'url';

 import { defineConfig, searchForWorkspaceRoot } from 'vite';
-import devtoolsJson from 'vite-plugin-devtools-json';
 import { storybookTest } from '@storybook/addon-vitest/vitest-plugin';
-import { llamaCppBuildPlugin } from './scripts/vite-plugin-llama-cpp-build';
+import { splashScreenPlugin } from './scripts/vite-plugin-splash-screen';
+import { buildInfoPlugin } from './scripts/vite-plugin-build-info';
+import { relativizeBasePlugin } from './scripts/vite-plugin-relativize-base';
 import { playwright } from '@vitest/browser-playwright';
+import { SVELTEKIT_PWA_OPTIONS } from './src/lib/constants/pwa';

 const __dirname = dirname(fileURLToPath(import.meta.url));

@@ -37,7 +40,14 @@ export default defineConfig({
 		minify: true
 	},

-	plugins: [tailwindcss(), sveltekit(), devtoolsJson(), llamaCppBuildPlugin()],
+	plugins: [
+		tailwindcss(),
+		sveltekit(),
+		SvelteKitPWA(SVELTEKIT_PWA_OPTIONS),
+		splashScreenPlugin(),
+		buildInfoPlugin(),
+		relativizeBasePlugin()
+	],

 	test: {
 		projects: [
@@ -478,7 +478,7 @@ bool set_socket_opt_time(socket_t sock, int level, int optname,
 }

 bool is_hex(char c, int &v) {
-  if (isdigit(c)) {
+  if (isdigit(static_cast<unsigned char>(c))) {
    v = c - '0';
    return true;
  } else if ('A' <= c && c <= 'F') {
@@ -731,7 +731,7 @@ std::string sha1(const std::string &input) {
  // Pre-processing: adding padding bits
  std::string msg = input;
  uint64_t original_bit_len = static_cast<uint64_t>(msg.size()) * 8;
-  msg.push_back(static_cast<char>(0x80));
+  msg.push_back(static_cast<char>(0x80u));
  while (msg.size() % 64 != 56) {
    msg.push_back(0);
  }
@@ -4336,7 +4336,7 @@ bool is_multipart_boundary_chars_valid(const std::string &boundary) {
  auto valid = true;
  for (size_t i = 0; i < boundary.size(); i++) {
    auto c = boundary[i];
-    if (!std::isalnum(c) && c != '-' && c != '_') {
+    if (!std::isalnum(static_cast<unsigned char>(c)) && c != '-' && c != '_') {
      valid = false;
      break;
    }
@@ -4545,6 +4545,14 @@ void coalesce_ranges(Ranges &ranges, size_t content_length) {

 bool range_error(Request &req, Response &res) {
  if (!req.ranges.empty() && 200 <= res.status && res.status < 300) {
+    if (res.body.empty() && res.content_provider_ && res.content_length_ == 0) {
+      req.ranges.clear();
+      if (res.status == StatusCode::PartialContent_206) {
+        res.status = StatusCode::OK_200;
+      }
+      return false;
+    }
+
    ssize_t content_len = static_cast<ssize_t>(
        res.content_length_ ? res.content_length_ : res.body.size());

@@ -4813,9 +4821,10 @@ private:
 namespace fields {

 bool is_token_char(char c) {
-  return std::isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' ||
-         c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' ||
-         c == '.' || c == '^' || c == '_' || c == '`' || c == '|' || c == '~';
+  return std::isalnum(static_cast<unsigned char>(c)) || c == '!' || c == '#' ||
+         c == '$' || c == '%' || c == '&' || c == '\'' || c == '*' ||
+         c == '+' || c == '-' || c == '.' || c == '^' || c == '_' || c == '`' ||
+         c == '|' || c == '~';
 }

 bool is_token(const std::string &s) {
@@ -5306,41 +5315,68 @@ verify_cert_with_windows_schannel(const std::vector<unsigned char> &der_cert,
 }
 #endif // _WIN32

-bool setup_client_tls_session(const std::string &host, tls::ctx_t &ctx,
+// Loads CA file/dir configuration and applies the system CA policy to a
+// client TLS context. PEM data and native stores are applied to the context
+// directly at set time; has_custom_store reflects them for the Auto policy
+// decision.
+bool load_client_ca_config(tls::ctx_t ctx,
+                                  const std::string &ca_cert_file_path,
+                                  const std::string &ca_cert_dir_path,
+                                  bool has_custom_store, SystemCAMode mode,
+                                  uint64_t &backend_error) {
+  auto ret = true;
+
+  if (!ca_cert_file_path.empty()) {
+    if (!tls::load_ca_file(ctx, ca_cert_file_path.c_str())) {
+      backend_error = tls::get_error();
+      ret = false;
+    }
+  } else if (!ca_cert_dir_path.empty()) {
+    if (!tls::load_ca_dir(ctx, ca_cert_dir_path.c_str())) {
+      backend_error = tls::get_error();
+      ret = false;
+    }
+  }
+
+  auto has_custom_ca = !ca_cert_file_path.empty() ||
+                       !ca_cert_dir_path.empty() || has_custom_store;
+  if (mode == SystemCAMode::Enabled ||
+      (mode == SystemCAMode::Auto && !has_custom_ca)) {
+    if (!tls::load_system_certs(ctx)) { backend_error = tls::get_error(); }
+  }
+
+  return ret;
+}
+
+bool setup_client_tls_session(const std::string &host, tls::ctx_t ctx,
                                     tls::session_t &session, socket_t sock,
                                     bool server_certificate_verification,
-                                     const std::string &ca_cert_file_path,
-                                     tls::ca_store_t ca_cert_store,
                                     time_t timeout_sec, time_t timeout_usec) {
  using namespace tls;

-  ctx = create_client_context();
  if (!ctx) { return false; }

-  if (server_certificate_verification) {
-    if (!ca_cert_file_path.empty()) {
-      load_ca_file(ctx, ca_cert_file_path.c_str());
-    }
-    if (ca_cert_store) { set_ca_store(ctx, ca_cert_store); }
-    load_system_certs(ctx);
-  }
-
  bool is_ip = is_ip_address(host);

-#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT
-  if (is_ip && server_certificate_verification) {
-    set_verify_client(ctx, false);
-  } else {
-    set_verify_client(ctx, server_certificate_verification);
-  }
+#if defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || defined(CPPHTTPLIB_WOLFSSL_SUPPORT)
+  // Chain verification happens during the handshake even for IP hosts; the
+  // certificate identity is verified post-handshake via verify_hostname().
+  set_verify_client(ctx, server_certificate_verification);
 #endif

  session = create_session(ctx, sock);
  if (!session) { return false; }

-  // RFC 6066: SNI must not be set for IP addresses
-  if (!is_ip) { set_sni(session, host.c_str()); }
-  if (server_certificate_verification) { set_hostname(session, host.c_str()); }
+  // RFC 6066: SNI must not be set for IP addresses. On Mbed TLS and wolfSSL
+  // set_hostname also sets SNI, so it must be skipped for IP hosts as well;
+  // their identity is checked post-handshake below instead.
+  if (!is_ip) {
+    if (server_certificate_verification) {
+      set_hostname(session, host.c_str());
+    } else {
+      set_sni(session, host.c_str());
+    }
+  }

  if (!connect_nonblocking(session, sock, timeout_sec, timeout_usec, nullptr)) {
    return false;
@@ -5348,6 +5384,14 @@ bool setup_client_tls_session(const std::string &host, tls::ctx_t &ctx,

  if (server_certificate_verification) {
    if (get_verify_result(session) != 0) { return false; }
+
+    // Identity check against the peer certificate, post-handshake for all
+    // backends (same as SSLClient). For IP hosts this is the only identity
+    // verification since no hostname is bound during the handshake.
+    auto server_cert = get_peer_cert(session);
+    if (!server_cert) { return false; }
+    auto cert_guard = detail::scope_exit([&] { free_cert(server_cert); });
+    if (!verify_hostname(server_cert, host.c_str())) { return false; }
  }

  return true;
@@ -7194,6 +7238,11 @@ Server::set_expect_100_continue_handler(Expect100ContinueHandler handler) {
  return *this;
 }

+Server &Server::set_start_handler(StartHandler handler) {
+  start_handler_ = std::move(handler);
+  return *this;
+}
+
 Server &Server::set_address_family(int family) {
  address_family_ = family;
  return *this;
@@ -7889,6 +7938,8 @@ bool Server::listen_internal() {
  is_running_ = true;
  auto se = detail::scope_exit([&]() { is_running_ = false; });

+  if (start_handler_) { start_handler_(); }
+
  {
    std::unique_ptr<TaskQueue> task_queue(new_task_queue());

@@ -8032,26 +8083,26 @@ bool Server::routing(Request &req, Response &res, Stream &strm) {
      }
    }

-    // Read content into `req.body`
-    if (!read_content(strm, req, res)) {
-      output_error_log(Error::Read, &req);
-      return false;
-    }
+    // NOTE: `req.body` is not read here. For a regular handler the body is
+    // read inside dispatch_request(), after the route has matched and the
+    // pre-request handler has approved the request, so that a rejected
+    // request (e.g. failed authentication) never forces us to buffer a
+    // potentially large body.
  }

  // Regular handler
  if (req.method == "GET" || req.method == "HEAD") {
-    return dispatch_request(req, res, get_handlers_);
+    return dispatch_request(req, res, get_handlers_, strm);
  } else if (req.method == "POST") {
-    return dispatch_request(req, res, post_handlers_);
+    return dispatch_request(req, res, post_handlers_, strm);
  } else if (req.method == "PUT") {
-    return dispatch_request(req, res, put_handlers_);
+    return dispatch_request(req, res, put_handlers_, strm);
  } else if (req.method == "DELETE") {
-    return dispatch_request(req, res, delete_handlers_);
+    return dispatch_request(req, res, delete_handlers_, strm);
  } else if (req.method == "OPTIONS") {
-    return dispatch_request(req, res, options_handlers_);
+    return dispatch_request(req, res, options_handlers_, strm);
  } else if (req.method == "PATCH") {
-    return dispatch_request(req, res, patch_handlers_);
+    return dispatch_request(req, res, patch_handlers_, strm);
  }

  res.status = StatusCode::BadRequest_400;
@@ -8059,17 +8110,29 @@ bool Server::routing(Request &req, Response &res, Stream &strm) {
 }

 bool Server::dispatch_request(Request &req, Response &res,
-                                     const Handlers &handlers) const {
+                                     const Handlers &handlers, Stream &strm) {
  for (const auto &x : handlers) {
    const auto &matcher = x.first;
    const auto &handler = x.second;

    if (matcher->match(req)) {
      req.matched_route = matcher->pattern();
-      if (!pre_request_handler_ ||
-          pre_request_handler_(req, res) != HandlerResponse::Handled) {
-        handler(req, res);
+
+      // Run the pre-request handler before reading the body so a rejected
+      // request (e.g. failed authentication) never forces us to buffer a
+      // potentially large body. `req.matched_route` is available here.
+      if (pre_request_handler_ &&
+          pre_request_handler_(req, res) == HandlerResponse::Handled) {
+        return true;
      }
+
+      // The route matched and the request was approved; read the body now.
+      if (detail::expect_content(req) && !read_content(strm, req, res)) {
+        output_error_log(Error::Read, &req);
+        return false;
+      }
+
+      handler(req, res);
      return true;
    }
  }
@@ -8638,6 +8701,7 @@ void ClientImpl::copy_settings(const ClientImpl &rhs) {
  ca_cert_dir_path_ = rhs.ca_cert_dir_path_;
  server_certificate_verification_ = rhs.server_certificate_verification_;
  server_hostname_verification_ = rhs.server_hostname_verification_;
+  system_ca_mode_ = rhs.system_ca_mode_;
 #endif
 }

@@ -9420,6 +9484,7 @@ bool ClientImpl::create_redirect_client(
        server_certificate_verification_);
    redirect_client.enable_server_hostname_verification(
        server_hostname_verification_);
+    redirect_client.system_ca_mode_ = system_ca_mode_;

    // Transfer CA certificate to redirect client
    if (!ca_cert_pem_.empty()) {
@@ -11071,6 +11136,10 @@ void ClientImpl::enable_server_certificate_verification(bool enabled) {
 void ClientImpl::enable_server_hostname_verification(bool enabled) {
  server_hostname_verification_ = enabled;
 }
+
+void ClientImpl::enable_system_ca(bool enabled) {
+  system_ca_mode_ = enabled ? SystemCAMode::Enabled : SystemCAMode::Disabled;
+}
 #endif

 void ClientImpl::set_logger(Logger logger) {
@@ -12156,6 +12225,7 @@ void SSLClient::set_ca_cert_store(tls::ca_store_t ca_cert_store) {
  if (ca_cert_store && ctx_) {
    // set_ca_store takes ownership of ca_cert_store
    tls::set_ca_store(ctx_, ca_cert_store);
+    ca_cert_store_set_ = true;
  } else if (ca_cert_store) {
    tls::free_ca_store(ca_cert_store);
  }
@@ -12192,21 +12262,10 @@ bool SSLClient::load_certs() {
  std::call_once(initialize_cert_, [&]() {
    std::lock_guard<std::mutex> guard(ctx_mutex_);

-    if (!ca_cert_file_path_.empty()) {
-      if (!tls::load_ca_file(ctx_, ca_cert_file_path_.c_str())) {
-        last_backend_error_ = tls::get_error();
-        ret = false;
-      }
-    } else if (!ca_cert_dir_path_.empty()) {
-      if (!tls::load_ca_dir(ctx_, ca_cert_dir_path_.c_str())) {
-        last_backend_error_ = tls::get_error();
-        ret = false;
-      }
-    } else if (ca_cert_pem_.empty()) {
-      if (!tls::load_system_certs(ctx_)) {
-        last_backend_error_ = tls::get_error();
-      }
-    }
+    ret = detail::load_client_ca_config(
+        ctx_, ca_cert_file_path_, ca_cert_dir_path_,
+        !ca_cert_pem_.empty() || ca_cert_store_set_, system_ca_mode_,
+        last_backend_error_);
  });

  return ret;
@@ -12229,13 +12288,9 @@ bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
 #if defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || defined(CPPHTTPLIB_WOLFSSL_SUPPORT)
  // MbedTLS/wolfSSL need explicit verification mode (OpenSSL uses
  // SSL_VERIFY_NONE by default and performs all verification post-handshake).
-  // For IP addresses with verification enabled, use OPTIONAL mode since
-  // these backends require hostname for strict verification.
-  if (is_ip && server_certificate_verification_) {
-    set_verify_client(ctx_, false);
-  } else {
-    set_verify_client(ctx_, server_certificate_verification_);
-  }
+  // Chain verification happens during the handshake even for IP hosts; the
+  // certificate identity is verified post-handshake via verify_hostname().
+  set_verify_client(ctx_, server_certificate_verification_);
 #endif

  // Create TLS session
@@ -12335,9 +12390,12 @@ bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
    // This provides real-time certificate validation with Windows Update
    // integration, working with both OpenSSL and MbedTLS backends.
    // Skip when a custom CA cert is specified, as the Windows certificate
-    // store would not know about user-provided CA certificates.
-    if (enable_windows_cert_verification_ && ca_cert_file_path_.empty() &&
-        ca_cert_dir_path_.empty() && ca_cert_pem_.empty()) {
+    // store would not know about user-provided CA certificates. Also skip
+    // when system CA trust is explicitly disabled.
+    if (enable_windows_cert_verification_ &&
+        system_ca_mode_ != SystemCAMode::Disabled &&
+        ca_cert_file_path_.empty() && ca_cert_dir_path_.empty() &&
+        ca_cert_pem_.empty() && !ca_cert_store_set_) {
      std::vector<unsigned char> der;
      if (get_cert_der(server_cert, der)) {
        uint64_t wincrypt_error = 0;
@@ -12376,6 +12434,10 @@ void Client::enable_server_hostname_verification(bool enabled) {
  cli_->enable_server_hostname_verification(enabled);
 }

+void Client::enable_system_ca(bool enabled) {
+  cli_->enable_system_ca(enabled);
+}
+
 #ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
 void Client::enable_windows_certificate_verification(bool enabled) {
  if (is_ssl_) {
@@ -12399,7 +12461,10 @@ void Client::set_ca_cert_store(tls::ca_store_t ca_cert_store) {
 }

 void Client::load_ca_cert_store(const char *ca_cert, std::size_t size) {
-  set_ca_cert_store(tls::create_ca_store(ca_cert, size));
+  if (is_ssl_) {
+    // Use the PEM-based path so the CA data is retained for redirect transfer
+    static_cast<SSLClient &>(*cli_).load_ca_cert_store(ca_cert, size);
+  }
 }

 void
@@ -13041,15 +13106,22 @@ bool set_hostname(session_t session, const char *hostname) {

  auto ssl = static_cast<SSL *>(session);

-  // Set SNI (Server Name Indication)
-  if (!set_sni(session, hostname)) { return false; }
-
  // Enable hostname verification
  auto param = SSL_get0_param(ssl);
  if (!param) return false;

-  X509_VERIFY_PARAM_set_hostflags(param, X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS);
-  if (X509_VERIFY_PARAM_set1_host(param, hostname, 0) != 1) { return false; }
+  if (detail::is_ip_address(hostname)) {
+    // RFC 6066: SNI must not be set for IP addresses; verify against the
+    // certificate's IP SANs instead of its DNS names
+    if (X509_VERIFY_PARAM_set1_ip_asc(param, hostname) != 1) { return false; }
+  } else {
+    // Set SNI (Server Name Indication)
+    if (!set_sni(session, hostname)) { return false; }
+
+    X509_VERIFY_PARAM_set_hostflags(param,
+                                    X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS);
+    if (X509_VERIFY_PARAM_set1_host(param, hostname, 0) != 1) { return false; }
+  }

  SSL_set_verify(ssl, SSL_VERIFY_PEER, nullptr);
  return true;
@@ -14279,6 +14351,14 @@ session_t create_session(ctx_t ctx, socket_t sock) {
    return nullptr;
  }

+  // Explicitly opt out of in-handshake hostname verification by default;
+  // since Mbed TLS 3.6.4 a client handshake with certificate verification
+  // fails outright when no hostname was set. set_sni() installs the real
+  // hostname for DNS hosts; for IP hosts (where SNI must not be set) the
+  // caller verifies the certificate identity post-handshake via
+  // verify_hostname().
+  mbedtls_ssl_set_hostname(&session->ssl, nullptr);
+
  // Set BIO callbacks
  mbedtls_ssl_set_bio(&session->ssl, &session->sock, impl::mbedtls_net_send_cb,
                      impl::mbedtls_net_recv_cb, nullptr);
@@ -14838,10 +14918,17 @@ bool set_ca_store(ctx_t ctx, ca_store_t store) {
  while (src != nullptr) {
    int ret = mbedtls_x509_crt_parse_der(&mbed_ctx->ca_chain, src->raw.p,
                                         src->raw.len);
-    if (ret != 0) { return false; }
+    if (ret != 0) {
+      free_ca_store(store);
+      return false;
+    }
    src = src->next;
  }

+  // This function takes ownership of the store; the chain was deep-copied
+  // above, so release the source
+  free_ca_store(store);
+
  // Update the SSL config to use the new CA chain
  mbedtls_ssl_conf_ca_chain(&mbed_ctx->conf, &mbed_ctx->ca_chain, nullptr);
  return true;
@@ -15985,6 +16072,9 @@ bool set_ca_store(ctx_t ctx, ca_store_t store) {
      wctx->ctx, reinterpret_cast<const unsigned char *>(ca->pem_data.data()),
      static_cast<long>(ca->pem_data.size()), SSL_FILETYPE_PEM);
  if (ret == SSL_SUCCESS) { wctx->ca_pem_data_ += ca->pem_data; }
+  // This function takes ownership of the store; the PEM data was copied into
+  // the context, so release the source
+  free_ca_store(store);
  return ret == SSL_SUCCESS;
 }

@@ -16343,9 +16433,16 @@ WebSocketClient::WebSocketClient(
    if (!uc.port.empty() && !detail::parse_port(uc.port, port_)) { return; }

    path_ = std::move(uc.path);
+    if (!uc.query.empty()) { path_ += uc.query; }

 #ifdef CPPHTTPLIB_SSL_ENABLED
    is_ssl_ = is_ssl;
+    if (is_ssl_) {
+      // The context lives as long as the client so that CA configuration
+      // survives reconnects; sessions are created per connection.
+      tls_ctx_ = tls::create_client_context();
+      if (!tls_ctx_) { return; }
+    }
 #else
    if (is_ssl) { return; }
 #endif
@@ -16354,7 +16451,15 @@ WebSocketClient::WebSocketClient(
  }
 }

-WebSocketClient::~WebSocketClient() { shutdown_and_close(); }
+WebSocketClient::~WebSocketClient() {
+  shutdown_and_close();
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  if (tls_ctx_) {
+    tls::free_context(tls_ctx_);
+    tls_ctx_ = nullptr;
+  }
+#endif
+}

 bool WebSocketClient::is_valid() const { return is_valid_; }

@@ -16366,10 +16471,6 @@ void WebSocketClient::shutdown_and_close() {
      tls::free_session(tls_session_);
      tls_session_ = nullptr;
    }
-    if (tls_ctx_) {
-      tls::free_context(tls_ctx_);
-      tls_ctx_ = nullptr;
-    }
  }
 #endif
  if (ws_ && ws_->is_open()) { ws_->close(); }
@@ -16384,10 +16485,18 @@ void WebSocketClient::shutdown_and_close() {
 bool WebSocketClient::create_stream(std::unique_ptr<Stream> &strm) {
 #ifdef CPPHTTPLIB_SSL_ENABLED
  if (is_ssl_) {
-    if (!detail::setup_client_tls_session(
-            host_, tls_ctx_, tls_session_, sock_,
-            server_certificate_verification_, ca_cert_file_path_,
-            ca_cert_store_, read_timeout_sec_, read_timeout_usec_)) {
+    if (server_certificate_verification_ && !certs_loaded_) {
+      uint64_t backend_error = 0;
+      detail::load_client_ca_config(tls_ctx_, ca_cert_file_path_, std::string(),
+                                    custom_ca_loaded_, system_ca_mode_,
+                                    backend_error);
+      certs_loaded_ = true;
+    }
+
+    if (!detail::setup_client_tls_session(host_, tls_ctx_, tls_session_, sock_,
+                                          server_certificate_verification_,
+                                          read_timeout_sec_,
+                                          read_timeout_usec_)) {
      return false;
    }

@@ -16407,9 +16516,14 @@ bool WebSocketClient::connect() {
  if (!is_valid_) { return false; }
  shutdown_and_close();

+  // Check whether a custom IP is specified for host_
+  std::string ip;
+  auto it = addr_map_.find(host_);
+  if (it != addr_map_.end()) { ip = it->second; }
+
  Error error;
  sock_ = detail::create_client_socket(
-      host_, std::string(), port_, address_family_, tcp_nodelay_, ipv6_v6only_,
+      host_, ip, port_, address_family_, tcp_nodelay_, ipv6_v6only_,
      socket_options_, connection_timeout_sec_, connection_timeout_usec_,
      read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
      write_timeout_usec_, interface_, error);
@@ -16504,6 +16618,11 @@ void WebSocketClient::set_interface(const std::string &intf) {
  interface_ = intf;
 }

+void WebSocketClient::set_hostname_addr_map(
+    std::map<std::string, std::string> addr_map) {
+  addr_map_ = std::move(addr_map);
+}
+
 #ifdef CPPHTTPLIB_SSL_ENABLED

 void WebSocketClient::set_ca_cert_path(const std::string &path) {
@@ -16511,7 +16630,21 @@ void WebSocketClient::set_ca_cert_path(const std::string &path) {
 }

 void WebSocketClient::set_ca_cert_store(tls::ca_store_t store) {
-  ca_cert_store_ = store;
+  if (store && tls_ctx_) {
+    // set_ca_store takes ownership of store
+    tls::set_ca_store(tls_ctx_, store);
+    custom_ca_loaded_ = true;
+  } else if (store) {
+    tls::free_ca_store(store);
+  }
+}
+
+void WebSocketClient::load_ca_cert_store(const char *ca_cert,
+                                                std::size_t size) {
+  if (tls_ctx_ && ca_cert && size > 0) {
+    tls::load_ca_pem(tls_ctx_, ca_cert, size);
+    custom_ca_loaded_ = true;
+  }
 }

 void
@@ -16519,6 +16652,10 @@ WebSocketClient::enable_server_certificate_verification(bool enabled) {
  server_certificate_verification_ = enabled;
 }

+void WebSocketClient::enable_system_ca(bool enabled) {
+  system_ca_mode_ = enabled ? SystemCAMode::Enabled : SystemCAMode::Disabled;
+}
+
 #endif // CPPHTTPLIB_SSL_ENABLED

 } // namespace ws
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H

-#define CPPHTTPLIB_VERSION "0.46.1"
-#define CPPHTTPLIB_VERSION_NUM "0x002e01"
+#define CPPHTTPLIB_VERSION "0.47.0"
+#define CPPHTTPLIB_VERSION_NUM "0x002f00"

 #ifdef _WIN32
 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@@ -810,6 +810,11 @@ enum class SSLVerifierResponse {
  CertificateRejected
 };

+// System CA loading policy for SSL clients. Auto (the default) loads system
+// CA certs only when no custom CA is configured; enable_system_ca() switches
+// to an explicit policy.
+enum class SystemCAMode { Auto, Enabled, Disabled };
+
 enum StatusCode {
  // Information responses
  Continue_100 = 100,
@@ -1643,6 +1648,8 @@ public:
  using Expect100ContinueHandler =
      std::function<int(const Request &, Response &)>;

+  using StartHandler = std::function<void()>;
+
  using WebSocketHandler =
      std::function<void(const Request &, ws::WebSocket &)>;
  using SubProtocolSelector =
@@ -1694,6 +1701,9 @@ public:
  Server &set_pre_request_handler(HandlerWithResponse handler);

  Server &set_expect_100_continue_handler(Expect100ContinueHandler handler);
+
+  Server &set_start_handler(StartHandler handler);
+
  Server &set_logger(Logger logger);
  Server &set_pre_compression_logger(Logger logger);
  Server &set_error_logger(ErrorLogger error_logger);
@@ -1807,8 +1817,8 @@ private:
                             const std::string &etag, time_t mtime) const;
  bool check_if_range(Request &req, const std::string &etag,
                      time_t mtime) const;
-  bool dispatch_request(Request &req, Response &res,
-                        const Handlers &handlers) const;
+  bool dispatch_request(Request &req, Response &res, const Handlers &handlers,
+                        Stream &strm);
  bool dispatch_request_for_content_reader(
      Request &req, Response &res, ContentReader content_reader,
      const HandlersForContentReader &handlers) const;
@@ -1883,6 +1893,7 @@ private:
  Handler post_routing_handler_;
  HandlerWithResponse pre_request_handler_;
  Expect100ContinueHandler expect_100_continue_handler_;
+  StartHandler start_handler_;

  mutable std::mutex logger_mutex_;
  Logger logger_;
@@ -2445,6 +2456,7 @@ public:
                        const std::string &ca_cert_dir_path = std::string());
  void enable_server_certificate_verification(bool enabled);
  void enable_server_hostname_verification(bool enabled);
+  void enable_system_ca(bool enabled);

 protected:
  std::string digest_auth_username_;
@@ -2455,6 +2467,7 @@ protected:
  std::string ca_cert_dir_path_;
  bool server_certificate_verification_ = true;
  bool server_hostname_verification_ = true;
+  SystemCAMode system_ca_mode_ = SystemCAMode::Auto;
  std::string ca_cert_pem_; // Store CA cert PEM for redirect transfer
  int last_ssl_error_ = 0;
  uint64_t last_backend_error_ = 0;
@@ -2661,6 +2674,7 @@ public:
                             const std::string &password);
  void enable_server_certificate_verification(bool enabled);
  void enable_server_hostname_verification(bool enabled);
+  void enable_system_ca(bool enabled);
  void set_ca_cert_path(const std::string &ca_cert_file_path,
                        const std::string &ca_cert_dir_path = std::string());

@@ -2798,6 +2812,11 @@ private:
  std::mutex ctx_mutex_;
  std::once_flag initialize_cert_;

+  // Tracks whether a custom CA store was applied via set_ca_cert_store(),
+  // since the store handle itself is owned by ctx_ and leaves no other trace.
+  // Used to keep custom CA configuration exclusive with system CA loading.
+  bool ca_cert_store_set_ = false;
+
  long verify_result_ = 0;

  std::function<SSLVerifierResponse(tls::session_t)> session_verifier_;
@@ -3842,11 +3861,14 @@ public:
  void set_socket_options(SocketOptions socket_options);
  void set_connection_timeout(time_t sec, time_t usec = 0);
  void set_interface(const std::string &intf);
+  void set_hostname_addr_map(std::map<std::string, std::string> addr_map);

 #ifdef CPPHTTPLIB_SSL_ENABLED
  void set_ca_cert_path(const std::string &path);
  void set_ca_cert_store(tls::ca_store_t store);
+  void load_ca_cert_store(const char *ca_cert, std::size_t size);
  void enable_server_certificate_verification(bool enabled);
+  void enable_system_ca(bool enabled);
 #endif

 private:
@@ -3876,12 +3898,17 @@ private:
  time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND;
  std::string interface_;

+  // Hostname-IP map
+  std::map<std::string, std::string> addr_map_;
+
 #ifdef CPPHTTPLIB_SSL_ENABLED
  bool is_ssl_ = false;
  tls::ctx_t tls_ctx_ = nullptr;
  tls::session_t tls_session_ = nullptr;
  std::string ca_cert_file_path_;
-  tls::ca_store_t ca_cert_store_ = nullptr;
+  bool custom_ca_loaded_ = false;
+  bool certs_loaded_ = false;
+  SystemCAMode system_ca_mode_ = SystemCAMode::Auto;
  bool server_certificate_verification_ = true;
 #endif
 };
Author	SHA1	Message	Date
Ruben Ortlam	3e7bd4f39a	vulkan: add pipeline barriers for memcpy read operations (#23770 ) * vulkan: add pipeline barriers for memcpy read/write operations * remove unnecessary host write pipeline barriers	2026-06-12 16:43:50 +02:00
Aleksander Grygier	f7ca93d12c	ui: PWA support (#23871 ) * feat: Add basic PWA support and service worker for offline caching * feat: Vite PWA implementation WIP * feat: Improve PWA icons generation * feat: Add PWA workbox to server routes * feat: Include `version.json` in static assets * feat: Add HTTP cache headers for PWA static assets * feat: Update app name for `apple-mobile-web-app-title` * feat: Implement PWA versioning and automatic update detection * chore: Update `.gitignore` files * feat: Splash Screens * feat: Add dark mode favicon support * refactor: Cleanup * fix: Use dark logo for dark splash screens * refactor: Simplify favicons SVG code * fix: Adjust caching and polling for reliable service worker updates * fix: Add missing favicon entry * fix: Align PWA service worker configuration with SvelteKit build structure * fix: Replace hashed bundle paths with versioned static paths * test: Add PWA tests * ci: Add build output for unit tests * refactor: Cleanup * fix: Server build & release versioning * chore: Update package-lock.json * chore: Increase PWA cache size * chore: Update packages * feat: Update favicons * refactor: Post-merge fix * feat: support explicit build version for PWA cache busting * fix: CI * feat: Improve PWA Refresh Alert UI * feat: Add toggleable build version display * refactor: Cleanup * feat: Add version mismatch detection and manual app reload * refactor: replace dynamic imports with static * refactor: Cleanup * feat: Add safe space for `pwa-<size>.png` rendered icons * fix: use relative paths for PWA assets to support base path deployment * feat: add PWA mode detection via URL query parameter * feat: Use ?cache=true for SW-cached PWA assets * refactor: Build process cleanup * refactor: Decouple PWA versioning and remove ?cache=true workaround * chore: Update README logo * feat: Include PWA Assets generation in build script * refactor: `usePwa` hook for core layout * fix: Relativize base vite plugin * fix: remove unnecessary 
backslash escapes in test regexes * test: update static asset paths for API Key test * refactor: Move SvelteKit PWA Options config to constants * ui: fix update notification never appearing Keep the PWA hook object intact instead of destructuring needRefreshByStorage, which freezes the reactive getter. Also exclude loading.html from PWA precache to prevent 404 errors and broken SW installation.	2026-06-12 15:53:26 +02:00
Georgi Gerganov	02182fc5b9	fit : avoid including llama-ext.h in fit.h (#24506 )	2026-06-12 15:57:05 +03:00
Georgi Gerganov	f532be8fac	sync : ggml	2026-06-12 15:55:35 +03:00
Georgi Gerganov	e08c226a2c	ggml : bump version to 0.15.1 (ggml/1541)	2026-06-12 15:55:35 +03:00
Adrien Gallouët	70b54e140c	vendor : update cpp-httplib to 0.47.0 (#24395 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-12 11:34:44 +02:00
Pascal	6471e3c090	UI/jpeg exif orientation (#24196 ) * ui: bake jpeg exif orientation into uploaded images stb_image in mtmd ignores exif metadata, so rotated smartphone photos reach the model with raw pixel orientation. The webui now reads the exif orientation tag at send time and feeds it into the existing capImageDataURLSize canvas pass: the browser applies the rotation when decoding, so capped images come out upright for free, and images under the cap threshold get a single plain redraw when orientation > 1. At most one re-encode ever happens per image. Upright jpegs with capping disabled pass through untouched, bit perfect. Adds jpeg-orientation.ts with a minimal exif parser working on a bounded base64 prefix (both endianness, returns 1 on any malformed input) and unit tests against handcrafted jpeg byte streams. * ui: move jpeg exif constants into lib/constants * ui: add browser test for jpeg orientation and capping Covers capImageDataURLSize end to end in chromium with real Pillow generated jpeg fixtures across exif orientations 1/3/5/6/8: upright quadrant colors checked pixel-wise, expected dimensions with and without capping, no orientation tag left in the output, and strict passthrough when nothing needs rewriting.	2026-06-12 10:20:27 +02:00
Ruixiang Wang	88a39274ec	spec: add EAGLE3 speculative decoding support (#18039 ) * llama : enable layer input extraction * spec: support eagle3 * eagle3: fix params bug * eagle3: support Gemma4 eagle3 from RedHatAI * eagle3: set sync when get features from target Co-authored-by: tnhnyzc <115956684+tnhnyzc@users.noreply.github.com> * eagle3 : fix ubatch handling in embd_layer_inp extraction and encoder Co-authored-by: Doğaç Eldenk <dogacel@gmail.com> * eagle3: adapt to upstream changes * eagle3: fix rebase issues and adapt to upstream changes * eagle3:exclude the eagle3 arch from test-llama-archs * eagle3: fix editorconfig check failures * eagle3: fix multi-seq issue in d2t vocab mapping * cont : minor style / clean-up * spec : remove `common_speculative_setup_draft_model()` * llama : clean-up unused API * eagle3: set d2t vocab mapping in decode graph * cont : assert layer inputs are configured * hparams : use n_embd_inp instead of n_embd_target_features * eagle3: make output.weight optional and inherit from target model when needed * haparams : generic norm-before-residual param * llama-ext : consistent names * cont : fix * hparams : remove target_hidden_size * cparams : rename output_layer_inp -> embeddings_layer_inp * arch : reuse ATTN_NORM_2 instead of adding new hidden norm * llama : clean-up names * cont : add assert + comment * Update conversion/llama.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: tnhnyzc <115956684+tnhnyzc@users.noreply.github.com> Co-authored-by: Doğaç Eldenk <dogacel@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-06-12 10:21:06 +03:00
				`@@ -0,0 +1 @@`
				`export const APP_NAME = import.meta.env?.VITE_PUBLIC_APP_NAME \|\| 'llama-ui';`
				`@@ -0,0 +1 @@`
				`export type SplashDimensions = { deviceW: number; deviceH: number; dpr: number };`