webui: switch to hash-based routing (alternative of #16079 ) (#16157 )

* Switched web UI to hash-based routing * Added hash to missed goto function call * Removed outdated SPA handling code * Fixed broken sidebar home link
Always show message actions for mobile UI + improvements for user message sizing (#16076 )
2026-07-01 01:57:43 +02:00 · 2025-09-26 18:36:48 +03:00 · 2025-09-26 15:59:07 +02:00 · 2025-09-26 16:09:34 +03:00 · 2025-09-26 15:00:44 +02:00 · 2025-09-26 14:14:28 +03:00
117 changed files with 10643 additions and 4228 deletions
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.2.0
+ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=sh-harbor.mthreads.com/haive/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=sh-harbor.mthreads.com/haive/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

@@ -2,10 +2,10 @@ ARG GCC_VERSION=15.2.0
 ARG UBUNTU_VERSION=24.04

 ### Build Llama.cpp stage
-FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
+FROM gcc:${GCC_VERSION} AS build

-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
    apt upgrade -y && \
    apt install -y --no-install-recommends \
@@ -40,7 +40,7 @@ COPY requirements     /opt/llama.cpp/gguf-py/requirements


 ### Collect all llama.cpp binaries, libraries and distro libraries
-FROM --platform=linux/s390x scratch AS collector
+FROM scratch AS collector

 # Copy llama.cpp binaries and libraries
 COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
@@ -49,13 +49,14 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


 ### Base image
-FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
+FROM ubuntu:${UBUNTU_VERSION} AS base

-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
    apt install -y --no-install-recommends \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
        curl libgomp1 libopenblas-dev && \
    apt autoremove -y && \
    apt clean -y && \
@@ -68,13 +69,13 @@ COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


 ### Full
-FROM --platform=linux/s390x base AS full
+FROM base AS full

 ENV PATH="/root/.cargo/bin:${PATH}"
 WORKDIR /app

-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
    apt install -y \
        git cmake libjpeg-dev \
@@ -97,7 +98,7 @@ ENTRYPOINT [ "/app/tools.sh" ]


 ### CLI Only
-FROM --platform=linux/s390x base AS light
+FROM base AS light

 WORKDIR /llama.cpp/bin

@@ -108,7 +109,7 @@ ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


 ### Server
-FROM --platform=linux/s390x base AS server
+FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

@@ -475,7 +475,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64

    steps:
      - name: Clone
@@ -1251,59 +1251,132 @@ jobs:
 # TODO: simplify the following workflows using a matrix
 # TODO: run lighter CI on PRs and the full CI only on master (if needed)
  ggml-ci-x64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, X64, CPU, low-perf]
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-x64-cpu-low-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
      - name: Test
        id: ggml-ci
        run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-arm64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU, low-perf]
+    runs-on: ubuntu-22.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-low-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
      - name: Test
        id: ggml-ci
        run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-x64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, X64, CPU, high-perf]
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-x64-cpu-high-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
      - name: Test
        id: ggml-ci
        run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-arm64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU, high-perf]
+    runs-on: ubuntu-22.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-high-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
      - name: Test
        id: ggml-ci
        run: |
-          GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

-  ggml-ci-x64-nvidia-v100-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
+  ggml-ci-arm64-cpu-high-perf-sve:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-high-perf-sve
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+  ggml-ci-x64-nvidia-cuda:
+    runs-on: [self-hosted, Linux, X64, NVIDIA]

    steps:
      - name: Clone
@@ -1316,8 +1389,8 @@ jobs:
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-v100-vulkan:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
+  ggml-ci-x64-nvidia-vulkan-cm:
+    runs-on: [self-hosted, Linux, X64, NVIDIA]

    steps:
      - name: Clone
@@ -1327,51 +1400,23 @@ jobs:
      - name: Test
        id: ggml-ci
        run: |
-          vulkaninfo
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-t4-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-t4-vulkan:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-t4-vulkan-coopmat1:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo
+          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

+  ggml-ci-x64-nvidia-vulkan-cm2:
+    runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
  ggml-ci-x64-cpu-amx:
    runs-on: [self-hosted, Linux, X64, CPU, AMX]

@@ -1385,31 +1430,33 @@ jobs:
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-v710-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD, V710]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-amd-v710-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD, V710]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+#  ggml-ci-x64-amd-vulkan:
+#    runs-on: [self-hosted, Linux, X64, AMD]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v4
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          vulkaninfo --summary
+#          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+#
+#  ggml-ci-x64-amd-rocm:
+#    runs-on: [self-hosted, Linux, X64, AMD]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v4
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          amd-smi static
+#          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]
@@ -1424,16 +1471,16 @@ jobs:
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-# TODO: install vulkan drivers
-#  ggml-ci-mac-vulkan:
-#    runs-on: [self-hosted, macOS, ARM64]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v4
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  ggml-ci-mac-vulkan:
+    runs-on: [self-hosted, macOS, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
@@ -68,22 +68,19 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      - name: Determine tag name
+      - name: Determine source tag name
+        id: srctag
+        uses: ./.github/actions/get-tag-name
+        env:
+          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+      - name: Determine image tag name
        id: tag
        shell: bash
        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
          REPO_NAME="${{ github.event.repository.name }}"

-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
          # list all tags possible
          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
              TYPE=""
@@ -91,9 +88,9 @@ jobs:
              TYPE="-${{ matrix.config.tag }}"
          fi
          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
@@ -101,7 +98,6 @@ jobs:
          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      - name: Free Disk Space (Ubuntu)
@@ -177,3 +173,29 @@ jobs:
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
+
+  create_tag:
+    name: Create and push git tag
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: write
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Determine source tag name
+        id: srctag
+        uses: ./.github/actions/get-tag-name
+        env:
+          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+      - name: Create and push git tag
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          git tag ${{ steps.srctag.outputs.name }} || exit 0
+          git push origin ${{ steps.srctag.outputs.name }} || exit 0
@@ -149,6 +149,6 @@ poetry.toml
 /run-chat.sh
 .ccache/

-# Code Workspace
+# IDE
 *.code-workspace
-
+.windsurf/
@@ -1,7 +0,0 @@
---
-trigger: manual
---
-
-#### Tailwind & CSS
-
-   We are using Tailwind v4 which uses oklch colors so we now want to refer to the CSS vars directly, without wrapping it with any color function like `hsla/hsl`, `rgba` etc.
@@ -1,48 +0,0 @@
---
-trigger: manual
---
-
-# Coding rules
-
-## Svelte & SvelteKit
-
-### Services vs Stores Separation Pattern
-
-#### `lib/services/` - Pure Business Logic
-
-   **Purpose**: Stateless business logic and external communication
-   **Contains**:
-    -   API calls to external services (ApiService)
-    -   Pure business logic functions (ChatService, etc.)
-   **Rules**:
-    -   NO Svelte runes ($state, $derived, $effect)
-    -   NO reactive state management
-    -   Pure functions and classes only
-    -   Can import types but not stores
-    -   Focus on "how" - implementation details
-
-#### `lib/stores/` - Reactive State Management
-
-   **Purpose**: Svelte-specific reactive state with runes
-   **Contains**:
-    -   Reactive state classes with $state, $derived, $effect
-    -   Database operations (DatabaseStore)
-    -   UI-focused state management
-    -   Store orchestration logic
-   **Rules**:
-    -   USE Svelte runes for reactivity
-    -   Import and use services for business logic
-    -   NO direct database operations
-    -   NO direct API calls (use services)
-    -   Focus on "what" - reactive state for UI
-
-#### Enforcement
-
-   Services should be testable without Svelte
-   Stores should leverage Svelte's reactivity system
-   Clear separation: services handle data, stores handle state
-   Services can be reused across multiple stores
-
-#### Misc
-
-   Always use `let` for $derived state variables
@@ -1,9 +0,0 @@
---
-trigger: manual
---
-
-# Automated Tests
-
-## General rules
-
-   NEVER include any test code in the production code - we should always have it in a separate dedicated files
@@ -1,7 +0,0 @@
---
-trigger: manual
---
-
-## TypeScript
-
-   Add JSDocs for functions
@@ -61,9 +61,10 @@
 /ggml/src/ggml-metal/                   @ggerganov
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
+/ggml/src/ggml-rpc/                     @rgerganov
 /ggml/src/ggml-threading.*              @ggerganov @slaren
 /ggml/src/ggml-vulkan/                  @0cc4m
-/ggml/src/ggml-zdnn/                    @taronaeo
+/ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
 /ggml/src/ggml.c                        @ggerganov @slaren
 /ggml/src/ggml.cpp                      @ggerganov @slaren
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
@@ -89,18 +90,20 @@
 /tools/mtmd/                            @ngxson
 /tools/perplexity/                      @ggerganov
 /tools/quantize/                        @ggerganov
+/tools/rpc/                             @rgerganov
 /tools/run/                             @ericcurtin
 /tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
 /tools/server/webui/                    @allozaur
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
-.clang-format                           @slaren
-.clang-tidy                             @slaren
-AUTHORS                                 @ggerganov
-CMakeLists.txt                          @ggerganov
-CONTRIBUTING.md                         @ggerganov
-LICENSE                                 @ggerganov
-README.md                               @ggerganov
-SECURITY.md                             @ggerganov
+/.clang-format                          @slaren
+/.clang-tidy                            @slaren
+/AUTHORS                                @ggerganov
+/CMakeLists.txt                         @ggerganov
+/CONTRIBUTING.md                        @ggerganov
+/LICENSE                                @ggerganov
+/README.md                              @ggerganov
+/SECURITY.md                            @ggerganov
+/build-xcframework.sh                   @danbev
 requirements*.txt                       @CISC
@@ -25,7 +25,7 @@ The project differentiates between 3 levels of contributors:
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Let other maintainers, merge their own PRs
+- Let other maintainers merge their own PRs
 - When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

@@ -178,6 +178,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
+- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
@@ -422,6 +422,7 @@ echo "Building for iOS devices..."
 cmake -B build-ios-device -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
+    -DCMAKE_SYSTEM_NAME=iOS \
    -DCMAKE_OSX_SYSROOT=iphoneos \
    -DCMAKE_OSX_ARCHITECTURES="arm64" \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
@@ -21,7 +21,7 @@ docker run --privileged -it \
    -v $HOME/llama.cpp/ci-cache:/ci-cache \
    -v $HOME/llama.cpp/ci-results:/ci-results \
    -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+    mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
 ```

 Inside the container, execute the following commands:
@@ -92,6 +92,12 @@ fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+
+    # if on Mac, disable METAL
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    fi
+
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@@ -103,6 +109,11 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
    MUSA_ARCH=${MUSA_ARCH:-21}
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
+
+if [ ! -z ${GG_BUILD_NO_SVE} ]; then
+    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -339,16 +350,16 @@ function gg_run_qwen3_0_6b {

    wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k
+    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
@@ -421,7 +432,7 @@ function gg_run_qwen3_0_6b {
 function gg_sum_qwen3_0_6b {
    gg_printf '### %s\n\n' "${ci}"

-    gg_printf 'Pythia 2.8B:\n'
+    gg_printf 'Qwen3 0.6B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
@@ -87,7 +87,39 @@ if (LLAMA_CURL)
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-endif ()
+else()
+    find_package(OpenSSL)
+    if (OpenSSL_FOUND)
+        include(CheckCSourceCompiles)
+        set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
+        check_c_source_compiles("
+        #include <openssl/opensslv.h>
+        #if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
+        #    if OPENSSL_VERSION_NUMBER < 0x1010107f
+        #        error bad version
+        #    endif
+        #else
+        #    if OPENSSL_VERSION_NUMBER < 0x30000000L
+        #        error bad version
+        #    endif
+        #endif
+        int main() { return 0; }
+        " OPENSSL_VERSION_SUPPORTED)
+        if (OPENSSL_VERSION_SUPPORTED)
+            message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
+            target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
+            target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
+            if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+                target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
+                find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
+                find_library(SECURITY_FRAMEWORK Security REQUIRED)
+                target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
+            endif()
+        endif()
+    else()
+        message(STATUS "OpenSSL not found, SSL support disabled")
+    endif()
+endif()

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
@@ -37,6 +37,8 @@
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
 #include <curl/easy.h>
+#else
+#include <cpp-httplib/httplib.h>
 #endif

 #ifdef __linux__
@@ -572,17 +574,364 @@ bool common_has_curl() {
    return false;
 }

-static bool common_download_file_single_online(const std::string &, const std::string &, const std::string &) {
-    LOG_ERR("error: built without CURL, cannot download model from internet\n");
-    return false;
-}
+struct common_url {
+    std::string scheme;
+    std::string user;
+    std::string password;
+    std::string host;
+    std::string path;
+};

-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
-    if (!url.empty()) {
-        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+static common_url parse_url(const std::string & url) {
+    common_url parts;
+    auto scheme_end = url.find("://");
+
+    if (scheme_end == std::string::npos) {
+        throw std::runtime_error("invalid URL: no scheme");
+    }
+    parts.scheme = url.substr(0, scheme_end);
+
+    if (parts.scheme != "http" && parts.scheme != "https") {
+        throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
    }

-    return {};
+    auto rest = url.substr(scheme_end + 3);
+    auto at_pos = rest.find('@');
+
+    if (at_pos != std::string::npos) {
+        auto auth = rest.substr(0, at_pos);
+        auto colon_pos = auth.find(':');
+        if (colon_pos != std::string::npos) {
+            parts.user = auth.substr(0, colon_pos);
+            parts.password = auth.substr(colon_pos + 1);
+        } else {
+            parts.user = auth;
+        }
+        rest = rest.substr(at_pos + 1);
+    }
+
+    auto slash_pos = rest.find('/');
+
+    if (slash_pos != std::string::npos) {
+        parts.host = rest.substr(0, slash_pos);
+        parts.path = rest.substr(slash_pos);
+    } else {
+        parts.host = rest;
+        parts.path = "/";
+    }
+    return parts;
+}
+
+static std::pair<httplib::Client, common_url> http_client(const std::string & url) {
+    common_url parts = parse_url(url);
+
+    if (parts.host.empty()) {
+        throw std::runtime_error("error: invalid URL format");
+    }
+
+    if (!parts.user.empty()) {
+        throw std::runtime_error("error: user:password@ not supported yet"); // TODO
+    }
+
+    httplib::Client cli(parts.scheme + "://" + parts.host);
+    cli.set_follow_location(true);
+
+    // TODO cert
+
+    return { std::move(cli), std::move(parts) };
+}
+
+static std::string show_masked_url(const common_url & parts) {
+    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
+}
+
+static void print_progress(size_t current, size_t total) { // TODO isatty
+    if (!total) {
+        return;
+    }
+
+    size_t width = 50;
+    size_t pct = (100 * current) / total;
+    size_t pos = (width * current) / total;
+
+    std::cout << "["
+              << std::string(pos, '=')
+              << (pos < width ? ">" : "")
+              << std::string(width - pos, ' ')
+              << "] " << std::setw(3) << pct << "%  ("
+              << current / (1024 * 1024) << " MB / "
+              << total / (1024 * 1024) << " MB)\r";
+    std::cout.flush();
+}
+
+struct common_file_metadata {
+    std::string etag;
+    std::string last_modified;
+};
+
+static std::optional<common_file_metadata> read_metadata(const std::string & path) {
+    if (!std::filesystem::exists(path)) {
+        return std::nullopt;
+    }
+
+    nlohmann::json metadata_json;
+    common_file_metadata metadata;
+
+    std::ifstream metadata_in(path);
+    try {
+        metadata_in >> metadata_json;
+        LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, path.c_str(),
+                metadata_json.dump().c_str());
+        if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
+            metadata.etag = metadata_json.at("etag");
+        }
+        if (metadata_json.contains("lastModified") && metadata_json.at("lastModified").is_string()) {
+            metadata.last_modified = metadata_json.at("lastModified");
+        }
+    } catch (const nlohmann::json::exception & e) {
+        LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, path.c_str(), e.what());
+        return std::nullopt;
+    }
+
+    return metadata;
+}
+
+static void write_metadata(const std::string & path,
+                           const std::string & url,
+                           const common_file_metadata & metadata) {
+    nlohmann::json metadata_json = {
+        { "url",          url                    },
+        { "etag",         metadata.etag          },
+        { "lastModified", metadata.last_modified }
+    };
+
+    write_file(path, metadata_json.dump(4));
+    LOG_DBG("%s: file metadata saved: %s\n", __func__, path.c_str());
+}
+
+static bool common_pull_file(httplib::Client & cli,
+                             const std::string & resolve_path,
+                             const std::string & path_tmp,
+                             bool supports_ranges,
+                             size_t existing_size,
+                             size_t & total_size) {
+    std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
+    if (!ofs.is_open()) {
+        LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
+        return false;
+    }
+
+    httplib::Headers headers;
+    if (supports_ranges && existing_size > 0) {
+        headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
+    }
+
+    std::atomic<size_t> downloaded{existing_size};
+
+    auto res = cli.Get(resolve_path, headers,
+        [&](const httplib::Response &response) {
+            if (existing_size > 0 && response.status != 206) {
+                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
+                return false;
+            }
+            if (existing_size == 0 && response.status != 200) {
+                LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
+                return false;
+            }
+            if (total_size == 0 && response.has_header("Content-Length")) {
+                try {
+                    size_t content_length = std::stoull(response.get_header_value("Content-Length"));
+                    total_size = existing_size + content_length;
+                } catch (const std::exception &e) {
+                    LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
+                }
+            }
+            return true;
+        },
+        [&](const char *data, size_t len) {
+            ofs.write(data, len);
+            if (!ofs) {
+                LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
+                return false;
+            }
+            downloaded += len;
+            print_progress(downloaded, total_size);
+            return true;
+        },
+        nullptr
+    );
+
+    std::cout << "\n";
+
+    if (!res) {
+        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
+        return false;
+    }
+
+    return true;
+}
+
+// download one single file from remote URL to local path
+static bool common_download_file_single_online(const std::string & url,
+                                               const std::string & path,
+                                               const std::string & bearer_token) {
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    static const int max_attempts        = 3;
+    static const int retry_delay_seconds = 2;
+
+    auto [cli, parts] = http_client(url);
+
+    httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
+    if (!bearer_token.empty()) {
+        default_headers.insert({"Authorization", "Bearer " + bearer_token});
+    }
+    cli.set_default_headers(default_headers);
+
+    common_file_metadata last;
+    const bool file_exists = std::filesystem::exists(path);
+    if (file_exists) {
+        if (auto opt = read_metadata(metadata_path)) {
+            last = *opt;
+        }
+    } else {
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    for (int i = 0; i < max_attempts; ++i) {
+        auto head = cli.Head(parts.path);
+        bool head_ok = head && head->status >= 200 && head->status < 300;
+        if (!head_ok) {
+            LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
+            if (file_exists) {
+                LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
+                return true;
+            }
+        }
+
+        common_file_metadata current;
+        if (head_ok) {
+            if (head->has_header("ETag")) {
+                current.etag = head->get_header_value("ETag");
+            }
+            if (head->has_header("Last-Modified")) {
+                current.last_modified = head->get_header_value("Last-Modified");
+            }
+        }
+
+        size_t total_size = 0;
+        if (head_ok && head->has_header("Content-Length")) {
+            try {
+                total_size = std::stoull(head->get_header_value("Content-Length"));
+            } catch (const std::exception& e) {
+                LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
+            }
+        }
+
+        bool supports_ranges = false;
+        if (head_ok && head->has_header("Accept-Ranges")) {
+            supports_ranges = head->get_header_value("Accept-Ranges") != "none";
+        }
+
+        bool should_download_from_scratch = false;
+        if (head_ok) {
+            if (!last.etag.empty() && last.etag != current.etag) {
+                LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
+                        last.etag.c_str(), current.etag.c_str());
+                should_download_from_scratch = true;
+            } else if (!last.last_modified.empty() && last.last_modified != current.last_modified) {
+                LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
+                        last.last_modified.c_str(), current.last_modified.c_str());
+                should_download_from_scratch = true;
+            }
+        }
+
+        if (file_exists) {
+            if (!should_download_from_scratch) {
+                LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+                return true;
+            }
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                return false;
+            }
+        }
+
+        const std::string path_temporary = path + ".downloadInProgress";
+        size_t existing_size = 0;
+
+        if (std::filesystem::exists(path_temporary)) {
+            if (supports_ranges && !should_download_from_scratch) {
+                existing_size = std::filesystem::file_size(path_temporary);
+            } else if (remove(path_temporary.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
+                return false;
+            }
+        }
+
+        // start the download
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
+                __func__, show_masked_url(parts).c_str(), path_temporary.c_str(),
+                current.etag.c_str(), current.last_modified.c_str());
+        const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
+        if (!was_pull_successful) {
+            if (i + 1 < max_attempts) {
+                const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
+                LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
+                std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+            } else {
+                LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
+            }
+
+            continue;
+        }
+
+        if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            return false;
+        }
+        write_metadata(metadata_path, url, current);
+        break;
+    }
+
+    return true;
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string          & url,
+                                                             const common_remote_params & params) {
+    auto [cli, parts] = http_client(url);
+
+    httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
+    for (const auto & header : params.headers) {
+        size_t pos = header.find(':');
+        if (pos != std::string::npos) {
+            headers.emplace(header.substr(0, pos), header.substr(pos + 1));
+        } else {
+            headers.emplace(header, "");
+        }
+    }
+
+    if (params.timeout > 0) {
+        cli.set_read_timeout(params.timeout, 0);
+        cli.set_write_timeout(params.timeout, 0);
+    }
+
+    std::vector<char> buf;
+    auto res = cli.Get(parts.path, headers,
+        [&](const char *data, size_t len) {
+            buf.insert(buf.end(), data, data + len);
+            return params.max_size == 0 ||
+                   buf.size() <= static_cast<size_t>(params.max_size);
+        },
+        nullptr
+    );
+
+    if (!res) {
+        throw std::runtime_error("error: cannot make GET request");
+    }
+
+    return { res->status, std::move(buf) };
 }

 #endif // LLAMA_USE_CURL
@@ -14,6 +14,7 @@
 #include <climits>
 #include <cmath>
 #include <codecvt>
+#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
@@ -960,15 +961,13 @@ struct common_init_result common_init_from_params(common_params & params) {

        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep && !has_rerank_prompt) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
        }

        if (!ok) {
@@ -738,7 +738,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //

-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

 static std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
    if (ctx) {
        llama_perf_context_print(ctx);
+        llama_memory_breakdown_print(ctx);
    }
 }

@@ -3717,11 +3717,29 @@ class Qwen2MoeModel(TextModel):
 class Qwen3Model(Qwen2Model):
    model_arch = gguf.MODEL_ARCH.QWEN3

+    # extra logic for rerank models
+    is_rerank: bool = False
+    is_tied_embeddings: bool = False
+    token_false_id: int | None = None
+    token_true_id: int | None = None
+
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
+
+        # track for intern-s1-mini
        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

+        # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+        readme_path = self.dir_model / "README.md"
+        readme_text = ""
+        if readme_path.exists():
+            with readme_path.open("r", encoding="utf-8") as f:
+                readme_text = f.read()
+        if "# Qwen3-Reranker" in readme_text:
+            self._find_rerank_config()
+
    def set_vocab(self):
        # deal with intern-s1-mini
        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@@ -3730,6 +3748,53 @@ class Qwen3Model(Qwen2Model):

        super().set_vocab()

+    def _find_rerank_config(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+
+        self.is_rerank = True
+        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
+        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
+        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+
+        assert self.token_false_id is not None and self.token_true_id is not None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.is_rerank:
+            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
+            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+            self.gguf_writer.add_chat_template([{
+                "name": "rerank",
+                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
+                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
+                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            }])
+
+    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
+        # extract "yes" and "no" tokens from the output lm_head tensor
+        false_row = data_torch[self.token_false_id]
+        true_row = data_torch[self.token_true_id]
+        return torch.stack([true_row, false_row], dim=0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if self.is_rerank:
+            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
+            is_real_head = not self.is_tied_embeddings and "lm_head" in name
+            if is_tied_head or is_real_head:
+                cls_out_head = (
+                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
+                    self._get_cls_out_tensor(data_torch),
+                )
+                if is_tied_head:
+                    embed = (self.map_tensor_name(name), data_torch)
+                    return [cls_out_head, embed]
+                if is_real_head:
+                    return [cls_out_head]
+
+        return super().modify_tensors(data_torch, name, bid)
+

@ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
@@ -7930,6 +7995,121 @@ class BailingMoeModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
+class GroveMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GROVEMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
+        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
+        self.gguf_writer.add_experts_per_group(2)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
+        self.gguf_writer.add_expert_group_scale(0.05)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _chunk_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".expert_bias"):
+            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
+            return []
+
+        # process the experts separately
+        if name.find("chunk_experts") != -1:
+            n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
+            assert bid is not None
+
+            if self._chunk_experts is None:
+                self._chunk_experts = [{} for _ in range(self.block_count)]
+
+            self._chunk_experts[bid][name] = data_torch
+
+            if len(self._chunk_experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
+                        datas.append(self._chunk_experts[bid][ename])
+                        del self._chunk_experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+        elif name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._chunk_experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
+            if len(chunk_experts) > 0:
+                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@ModelBase.register("ChameleonForConditionalGeneration")
@ModelBase.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(TextModel):
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

- `MUSA_VERSION` set to `rc4.2.0`
+- `MUSA_VERSION` set to `rc4.3.0`

 The resulting images, are essentially the same as the non-MUSA images:

@@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
        params.n_batch = params.n_ctx;
    }

-    // For non-causal models, batch size must be equal to ubatch size
-    params.n_ubatch = params.n_batch;
+    // for non-causal models, batch size must be equal to ubatch size
+    if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
+        params.n_ubatch = params.n_batch;
+    }
+
+    // get max number of sequences per batch
+    const int n_seq_max = llama_max_parallel_sequences();

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -144,6 +149,7 @@ int main(int argc, char ** argv) {
    // get added sep and eos token, if any
    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+    const char * rerank_prompt = llama_model_chat_template(model, "rerank");

    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
@@ -153,21 +159,28 @@ int main(int argc, char ** argv) {
        // split classification pairs and insert expected separator tokens
        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
-            std::string final_prompt;
-
-            for (size_t i = 0; i < pairs.size(); i++) {
-                final_prompt += pairs[i];
-                if (i != pairs.size() - 1) {
-                    if (!added_eos_token.empty()) {
-                        final_prompt += added_eos_token;
-                    }
-                    if (!added_sep_token.empty()) {
-                        final_prompt += added_sep_token;
+            if (rerank_prompt != nullptr) {
+                const std::string query = pairs[0];
+                const std::string doc = pairs[1];
+                std::string final_prompt = rerank_prompt;
+                string_replace_all(final_prompt, "{query}"   , query);
+                string_replace_all(final_prompt, "{document}", doc  );
+                inp = common_tokenize(vocab, final_prompt, true, true);
+            } else {
+                std::string final_prompt;
+                for (size_t i = 0; i < pairs.size(); i++) {
+                    final_prompt += pairs[i];
+                    if (i != pairs.size() - 1) {
+                        if (!added_eos_token.empty()) {
+                            final_prompt += added_eos_token;
+                        }
+                        if (!added_sep_token.empty()) {
+                            final_prompt += added_sep_token;
+                        }
                    }
                }
+                inp = common_tokenize(ctx, final_prompt, true, true);
            }
-
-            inp = common_tokenize(ctx, final_prompt, true, true);
        } else {
            inp = common_tokenize(ctx, prompt, true, true);
        }
@@ -229,7 +242,7 @@ int main(int argc, char ** argv) {
        const uint64_t n_toks = inp.size();

        // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch) {
+        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
            float * out = emb + e * n_embd;
            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
@@ -118,13 +118,17 @@ embedding-convert-model:

 embedding-run-original-model:
 	$(call validate_embedding_model_path,embedding-run-original-model)
-	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
+	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
+	./scripts/embedding/run-original-model.py \
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

 embedding-run-converted-model:
-	@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
+	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
-	@./scripts/embedding/compare-embeddings-logits.sh
+	@./scripts/embedding/compare-embeddings-logits.sh \
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

 embedding-inspect-original-model:
 	$(call validate_embedding_model_path,embedding-inspect-original-model)
@@ -156,7 +160,8 @@ embedding-quantize-model:
 	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)

 embedding-run-quantized-model:
-	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
+	@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

 ###
 ### Perplexity targets/recipes
@@ -105,12 +105,12 @@ new model, the model can be converted to GGUF format using the following command
 ### Inspecting the converted model
 The converted model can be inspected using the following command:
 ```console
-(venv) $ make inspect-converted-model
+(venv) $ make causal-inspect-converted-model
 ```

 ### Running the converted model
 ```console
-(venv) $ make run-converted-model
+(venv) $ make causal-run-converted-model
 ```

 ### Model logits verfication
@@ -151,6 +151,35 @@ int main(int argc, char ** argv) {
        logits = llama_get_embeddings(ctx);
        n_logits = llama_model_n_embd(model) * batch.n_tokens;
        type = "-embeddings";
+
+        const int n_embd = llama_model_n_embd(model);
+        const int n_embd_count = batch.n_tokens;
+
+        printf("Embedding dimension: %d\n", n_embd);
+        printf("\n");
+
+        // Print embeddings in the specified format
+        for (int j = 0; j < n_embd_count; j++) {
+            printf("embedding %d: ", j);
+
+            // Print first 3 values
+            for (int i = 0; i < 3 && i < n_embd; i++) {
+                printf("%9.6f ", logits[j * n_embd + i]);
+            }
+
+            printf(" ... ");
+
+            // Print last 3 values
+            for (int i = n_embd - 3; i < n_embd; i++) {
+                if (i >= 0) {
+                    printf("%9.6f ", logits[j * n_embd + i]);
+                }
+            }
+
+            printf("\n");
+        }
+        printf("\n");
+
        printf("Embeddings size: %d\n", n_logits);
    } else {
        logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
@@ -183,22 +212,23 @@ int main(int argc, char ** argv) {
        return 1;
    }
    for (int i = 0; i < n_logits; i++) {
-        fprintf(f, "%d: %.6f\n", i, logits[i]);  // Added index and changed format
+        fprintf(f, "%d: %.6f\n", i, logits[i]);
    }
    fclose(f);

-    // Print first and last 10 logits for quick verification
-    printf("First 10 logits: ");
-    for (int i = 0; i < 10 && i < n_logits; i++) {
-        printf("%.6f ", logits[i]);
-    }
-    printf("\n");
+    if (!embedding_mode) {
+        printf("First 10 logits: ");
+        for (int i = 0; i < 10 && i < n_logits; i++) {
+            printf("%.6f ", logits[i]);
+        }
+        printf("\n");

-    printf("Last 10 logits: ");
-    for (int i = n_logits - 10; i < n_logits; i++) {
-        if (i >= 0) printf("%.6f ", logits[i]);
+        printf("Last 10 logits: ");
+        for (int i = n_logits - 10; i < n_logits; i++) {
+            if (i >= 0) printf("%.6f ", logits[i]);
+        }
+        printf("\n\n");
    }
-    printf("\n\n");

    printf("Logits saved to %s\n", bin_filename);
    printf("Logits saved to %s\n", txt_filename);
@@ -48,7 +48,7 @@ def main():
        print(f"Error: Model file not found: {model_path}")
        sys.exit(1)

-    model_name = os.path.splitext(os.path.basename(model_path))[0]
+    model_name = os.path.basename(model_path)
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@@ -193,7 +193,7 @@ print(f"Input text: {repr(prompt)}")
 print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")

 with torch.no_grad():
-    outputs = model(input_ids)
+    outputs = model(input_ids.to(model.device))
    logits = outputs.logits

    # Extract logits for the last token (next token prediction)
@@ -2,8 +2,37 @@

 set -e

-MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
-MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
+# Parse command line arguments
+MODEL_PATH=""
+MODEL_NAME=""
+PROMPTS_FILE=""
+
+# First argument is always model path
+if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
+    MODEL_PATH="$1"
+    shift
+fi
+
+# Parse remaining arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --prompts-file|-pf)
+            PROMPTS_FILE="$2"
+            shift 2
+            ;;
+        *)
+            # If MODEL_NAME not set and this isn't a flag, use as model name
+            if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
+                MODEL_NAME="$1"
+            fi
+            shift
+            ;;
+    esac
+done
+
+# Set defaults
+MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
+MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"

 if [ -t 0 ]; then
    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
@@ -35,8 +64,18 @@ with open('$TEMP_FILE', 'wb') as f:
    trap "rm -f $TEMP_FILE" EXIT
 fi

-python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
+# Build the semantic_check.py command
+SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
    --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
-    --cpp-embeddings $CPP_EMBEDDINGS \
-    --prompt "Hello world today"
+    --cpp-embeddings $CPP_EMBEDDINGS"
+
+# Add prompts file if specified, otherwise use default prompt
+if [ -n "$PROMPTS_FILE" ]; then
+    SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
+else
+    SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
+fi
+
+# Execute the command
+eval $SEMANTIC_CMD

@@ -2,8 +2,27 @@

 set -e

-# First try command line argument, then environment variable, then file
-CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
+# Parse command line arguments
+CONVERTED_MODEL=""
+PROMPTS_FILE=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -p|--prompts-file)
+            PROMPTS_FILE="$2"
+            shift 2
+            ;;
+        *)
+            if [ -z "$CONVERTED_MODEL" ]; then
+                CONVERTED_MODEL="$1"
+            fi
+            shift
+            ;;
+    esac
+done
+
+# First try command line argument, then environment variable
+CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"

 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -13,8 +32,19 @@ if [ -z "$CONVERTED_MODEL" ]; then
    exit 1
 fi

+# Read prompt from file or use default
+if [ -n "$PROMPTS_FILE" ]; then
+    if [ ! -f "$PROMPTS_FILE" ]; then
+        echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
+        exit 1
+    fi
+    PROMPT=$(cat "$PROMPTS_FILE")
+else
+    PROMPT="Hello world today"
+fi
+
 echo $CONVERTED_MODEL

 cmake --build ../../build --target llama-logits -j8
-
-../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
+# TODO: update logits.cpp to accept a --file/-f option for the prompt
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
@@ -13,14 +13,37 @@ unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

 parser = argparse.ArgumentParser(description='Process model with specified path')
 parser.add_argument('--model-path', '-m', help='Path to the model')
+parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
 args = parser.parse_args()

+def read_prompt_from_file(file_path):
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        print(f"Error: Prompts file '{file_path}' not found")
+        exit(1)
+    except Exception as e:
+        print(f"Error reading prompts file: {e}")
+        exit(1)
+
 model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
 if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")

 tokenizer = AutoTokenizer.from_pretrained(model_path)

+config = AutoConfig.from_pretrained(model_path)
+
+# This can be used to override the sliding window size for manual testing. This
+# can be useful to verify the sliding window attention mask in the original model
+# and compare it with the converted .gguf model.
+if hasattr(config, 'sliding_window'):
+    original_sliding_window = config.sliding_window
+    #original_sliding_window = 6
+    print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
+
+print(f"Using unreleased model: {unreleased_model_name}")
 if unreleased_model_name:
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
@@ -29,19 +52,28 @@ if unreleased_model_name:

    try:
        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-        model = model_class.from_pretrained(model_path)  # Note: from_pretrained, not fromPretrained
+        model = model_class.from_pretrained(model_path, config=config)
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        exit(1)
 else:
-    model = AutoModel.from_pretrained(model_path)
+    model = AutoModel.from_pretrained(model_path, config=config)
 print(f"Model class: {type(model)}")
-#print(f"Model file: {type(model).__module__}")
-config = AutoConfig.from_pretrained(model_path)
+print(f"Model file: {type(model).__module__}")
+
+# Verify the model is using the correct sliding window
+if hasattr(model.config, 'sliding_window'):
+    print(f"Model's sliding_window: {model.config.sliding_window}")
+else:
+    print("Model config does not have sliding_window attribute")

 model_name = os.path.basename(model_path)

-texts = [ "Hello world today" ]
+if args.prompts_file:
+    prompt_text = read_prompt_from_file(args.prompts_file)
+    texts = [prompt_text]
+else:
+    texts = ["Hello world today"]

 encoded = tokenizer(
    texts,
@@ -67,7 +67,7 @@ def main():
    parser.add_argument('-m', '--model-path', required=True,  help='Path to the model directory')
    args = parser.parse_args()

-    model_name = os.path.splitext(os.path.basename(args.model_path))[0]
+    model_name = os.path.basename(args.model_path)
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@@ -40,7 +40,7 @@ if os.path.exists(index_path):
        file_path = os.path.join(model_path, file_name)
        print(f"\n--- From {file_name} ---")

-        with safe_open(file_path, framework="pt") as f:  # type: ignore
+        with safe_open(file_path, framework="pt") as f:
            for tensor_name in sorted(tensor_names):
                tensor = f.get_tensor(tensor_name)
                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
@@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
    # Single file model (original behavior)
    print("Single-file model detected")

-    with safe_open(single_file_path, framework="pt") as f:  # type: ignore
+    with safe_open(single_file_path, framework="pt") as f:
        keys = f.keys()
        print("Tensors in model:")
        for key in sorted(keys):
@@ -101,6 +101,17 @@ def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
        'rms_diff': np.sqrt(np.mean(diff_matrix**2))
    }

+def read_prompt_from_file(file_path):
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        print(f"Error: Prompts file '{file_path}' not found")
+        exit(1)
+    except Exception as e:
+        print(f"Error reading prompts file: {e}")
+        exit(1)
+
 def main():
    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
@@ -108,14 +119,20 @@ def main():
    parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
+    parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')

    args = parser.parse_args()

+    if args.prompts_file:
+        prompt = read_prompt_from_file(args.prompts_file)
+    else:
+        prompt = args.prompt
+
    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

    # Single prompt detailed comparison
-    print(f"\nTesting with prompt: '{args.prompt}'")
+    print(f"\nTesting with prompt: '{prompt}'")

    # Load the python model to get configuration information and also to load the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
@@ -144,7 +161,7 @@ def main():
        else:
            model = AutoModel.from_pretrained(args.model_path)

-    encoded = tokenizer(args.prompt, return_tensors="pt")
+    encoded = tokenizer(prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    n_tokens = len(tokens)
    print(f"n_tokens: {n_tokens}");
@@ -155,7 +172,7 @@ def main():
    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)

    # Run comparison
-    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
+    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)

    # Summary
    print(f"\n=== SUMMARY ===")
@@ -177,7 +177,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")


 if (MINGW)
-    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
+    set(GGML_WIN_VER "0xA00" CACHE STRING   "ggml: Windows version")
 endif()

 # ggml core
@@ -314,7 +314,8 @@ extern "C" {
    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

-    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 }

 // ops that return true for this function must not use restrict pointers for their backend implementations
-static bool ggml_op_can_inplace(enum ggml_op op) {
+bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
@@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te

 // dynamic tensor allocator

+#define GGML_VBUFFER_MAX_CHUNKS 16
+
+// relative memory address within an allocation that can be split into multiple buffers (chunks)
+struct buffer_address {
+    int chunk;     // index of a backend buffer
+    size_t offset; // local memory offset within the buffer
+};
+
+static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
+
+static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
+    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
+}
+
 struct free_block {
    size_t offset;
    size_t size;
 };

+struct tallocr_chunk {
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    int n_free_blocks;
+    size_t max_size;
+};
+
 struct ggml_dyn_tallocr {
    size_t alignment;
-    int n_free_blocks;
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
+    size_t max_chunk_size;
+    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
+    int n_chunks;

 #ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
-        size_t offset;
+        struct buffer_address addr;
    } allocated_tensors[1024];
 #endif
 };

+static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
+        chunk->free_blocks[i] = chunk->free_blocks[i-1];
+    }
+    // insert the new block
+    chunk->free_blocks[insert_pos].offset = offset;
+    chunk->free_blocks[insert_pos].size = size;
+    chunk->n_free_blocks++;
+}
+
+static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    for (int i = idx; i < chunk->n_free_blocks; i++) {
+        chunk->free_blocks[i] = chunk->free_blocks[i+1];
+    }
+    chunk->n_free_blocks--;
+}
+
+static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
+    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
+        return -1;
+    }
+    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
+    chunk->n_free_blocks = 1;
+    chunk->free_blocks[0].offset = 0;
+    // available space in a chunk is limited to max_chunk_size, but can be higher if:
+    // 1. a single tensor exceeds the maximum, and cannot fit any other way
+    // 2. we are running out of chunks
+    // backends will either manage to allocate the larger size, or report an error.
+    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
+    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+        chunk->free_blocks[0].size = SIZE_MAX/2;
+    }
+    alloc->chunks[alloc->n_chunks] = chunk;
+    alloc->n_chunks++;
+    return alloc->n_chunks - 1;
+}
+
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].offset = offset;
+            alloc->allocated_tensors[i].addr = addr;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].offset == offset) {
+        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
 }
 #endif

-static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
+static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

+    int best_fit_chunk = -1;
+    int best_fit_block = -1;
    size_t max_avail = 0;

-    // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
+    // find the best fitting free block besides the last block, within any chunk
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct tallocr_chunk * chunk = alloc->chunks[c];
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
+            struct free_block * block = &chunk->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_chunk = c;
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
        }
    }

    if (best_fit_block == -1) {
-        // the last block is our last resort
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // this should never happen
-            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                    __func__, size, max_avail);
-            GGML_ABORT("not enough space in the buffer");
-        }
-    }
-
-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    size_t offset = block->offset;
-    block->offset = offset + size;
-    block->size -= size;
-    if (block->size == 0) {
-        // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-    }
-
-    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, offset, tensor);
-    size_t cur_max = offset + size;
-    if (cur_max > alloc->max_size) {
-        // sort allocated_tensors by offset
-        for (int i = 0; i < 1024; i++) {
-            for (int j = i + 1; j < 1024; j++) {
-                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
-                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
-                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
-                    alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].offset = tmp_offset;
+        // no suitable block found, try the last block (this will grow a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct tallocr_chunk * chunk = alloc->chunks[c];
+            if (chunk->n_free_blocks > 0) {
+                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_chunk = c;
+                    best_fit_block = chunk->n_free_blocks - 1;
+                    break;
                }
            }
        }
-        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+    }
+
+    if (best_fit_block == -1) {
+        // none of the existing chunks have enough space left
+        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
+        best_fit_block = 0;
+    }
+    if (best_fit_chunk == -1) {
+        // since the last chunk always has virtually endless memory, this should never happen
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            __func__, size, max_avail);
+        GGML_ABORT("graph allocation: failed to reserve memory");
+    }
+
+    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
+    struct free_block    * block = &chunk->free_blocks[best_fit_block];
+    struct buffer_address  addr  = {.chunk = best_fit_chunk, .offset = block->offset };
+    block->offset += size;
+    block->size -= size;
+    if (block->size == 0) {
+        // remove block if empty
+        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
+    }
+
+    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    add_allocated_tensor(alloc, addr, tensor);
+    size_t cur_max = addr.offset + size;
+    if (cur_max > alloc->max_size[addr.chunk]) {
+        // sort allocated_tensors by chunk/offset
+        for (int i = 0; i < 1024; i++) {
+            for (int j = i + 1; j < 1024; j++) {
+                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
+                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
+                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
+                    alloc->allocated_tensors[j].tensor = tmp_tensor;
+                    alloc->allocated_tensors[j].addr = tmp_addr;
+                }
+            }
+        }
+        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
-                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                    alloc->allocated_tensors[i].offset,
-                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].addr.chunk,
+                    alloc->allocated_tensors[i].addr.offset,
+                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
@@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
    }
 #endif

-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    chunk->max_size = MAX(chunk->max_size, addr.offset + size);

-    return offset;
+    return addr;

    GGML_UNUSED(tensor);
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

-    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);

 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, offset, tensor);
+    remove_allocated_tensor(alloc, addr, tensor);
 #endif

+    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
+
    // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
+    for (int i = 0; i < chunk->n_free_blocks; i++) {
+        struct free_block * block = &chunk->free_blocks[i];
        // check if ptr is at the end of the block
-        if (block->offset + block->size == offset) {
+        if (block->offset + block->size == addr.offset) {
            block->size += size;
            // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            if (i < chunk->n_free_blocks - 1) {
+                struct free_block * next = &chunk->free_blocks[i+1];
+                if (block->offset + block->size == next->offset) {
+                    block->size += next->size;
+                    ggml_dyn_tallocr_remove_block(chunk, i+1);
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
-        if (offset + size == block->offset) {
-            block->offset = offset;
+        if (addr.offset + size == block->offset) {
+            block->offset = addr.offset;
            block->size += size;
            // check if we can merge with the previous block
-            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            if (i > 0) {
+                struct free_block * prev = &chunk->free_blocks[i-1];
+                if (prev->offset + prev->size == block->offset) {
+                    prev->size += block->size;
+                    ggml_dyn_tallocr_remove_block(chunk, i);
                }
            }
            return;
        }
    }
    // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].offset = offset;
-    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);

    GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->free_blocks[0].offset = 0;
-    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    alloc->max_size = 0;
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
+        free(alloc->chunks[i]);
+        alloc->chunks[i] = NULL;
+    }
+    alloc->n_chunks = 0;

 #ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
@@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
 #endif
 }

-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

    *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment     = */ alignment,
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
+        /*.alignment      = */ alignment,
+        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+        /*.chunks         = */ {NULL},
+        /*.n_chunks       = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
 #endif
@@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
 }

 static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
+    for (int i = 0; i < alloc->n_chunks; ++i) {
+        free(alloc->chunks[i]);
+    }
    free(alloc);
 }

 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+    size_t max_size = 0;
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        max_size += alloc->chunks[i]->max_size;
+    }
+    return max_size;
+}
+
+
+// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
+
+struct vbuffer {
+    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
+};
+
+static void ggml_vbuffer_free(struct vbuffer * buf) {
+    if (buf == NULL) {
+        return;
+    }
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
+        ggml_backend_buffer_free(buf->chunks[i]);
+    }
+    free(buf);
+}
+
+static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
+    int n = 0;
+    while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
+    return n;
+}
+
+static size_t ggml_vbuffer_size(struct vbuffer * buf) {
+    size_t size = 0;
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        size += ggml_backend_buffer_get_size(buf->chunks[i]);
+    }
+    return size;
+}
+
+static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
+    if (buf == NULL) {
+        return NULL;
+    }
+
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->chunks[n]->max_size;
+        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
+        if (buf->chunks[n] == NULL) {
+            ggml_vbuffer_free(buf);
+            return NULL;
+        }
+        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
+    }
+    return buf;
+}
+
+static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
+    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
+    void * addr = (char *)base + buf_addr.offset;
+    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
+}
+
+static void ggml_vbuffer_reset(struct vbuffer * buf) {
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        ggml_backend_buffer_reset(buf->chunks[i]);
+    }
 }


@@ -328,13 +470,13 @@ struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
-    size_t offset; // offset within the buffer
+    struct buffer_address addr;
    bool allocated;
 };

 struct tensor_alloc {
    int buffer_id;
-    size_t offset;
+    struct buffer_address addr;
    size_t size_max; // 0 = pre-allocated, unused, or view
 };

@@ -349,7 +491,7 @@ struct node_alloc {

 struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct vbuffer ** buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

@@ -370,7 +512,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
+    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -390,7 +532,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
        }
    }
    galloc->n_buffers = n_bufs;
@@ -418,7 +561,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
                }
            }
            if (!freed) {
-                ggml_backend_buffer_free(galloc->buffers[i]);
+                ggml_vbuffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
@@ -467,7 +610,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
-        assert(hn->offset == 0);
+        assert(hn->addr.offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
@@ -501,9 +644,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            assert(view_src_hn->offset == p_hn->offset);
+                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                            hn->buffer_id = p_hn->buffer_id;
-                            hn->offset = p_hn->offset;
+                            hn->addr = p_hn->addr;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            return;
@@ -511,7 +654,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
-                        hn->offset = p_hn->offset;
+                        hn->addr = p_hn->addr;
                        p_hn->allocated = false; // avoid freeing the parent
                        return;
                    }
@@ -522,9 +665,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
        hn->buffer_id = buffer_id;
-        hn->offset = offset;
+        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
    }
 }

@@ -536,12 +678,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    size_t offset = hn->offset;
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
    hn->allocated = false;
 }

@@ -692,24 +833,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
-            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
-            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.addr = hn->addr;
            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
-                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
-                node_alloc->src[j].offset   = hn->offset;
+                node_alloc->src[j].addr = hn->addr;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
@@ -725,11 +866,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
-            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
-            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.addr = hn->addr;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }
@@ -744,7 +885,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
        }

-        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@@ -753,13 +894,12 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

-            ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            ggml_vbuffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
            if (galloc->buffers[i] == NULL) {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }

@@ -772,11 +912,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {

 static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
-    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
-            assert(tensor_alloc->offset == SIZE_MAX);
+            assert(tensor_alloc->addr.offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
@@ -785,11 +925,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
        }
    } else {
        if (tensor->data == NULL) {
-            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
-            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+            assert(tensor_alloc->addr.offset != SIZE_MAX);
+            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
@@ -874,7 +1012,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
-            ggml_backend_buffer_reset(galloc->buffers[i]);
+            ggml_vbuffer_reset(galloc->buffers[i]);
        }
    }

@@ -917,7 +1055,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
        }
    }

-    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
 }

 // utils
@@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
    return sched->backends[i];
 }

+ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return sched->bufts[backend_index];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
@@ -160,7 +160,6 @@
 #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
 #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

    return ((v4f32)res)[0];
 }
+
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = __lsx_vsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = __lsx_vsigncov_b(x, y);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = lsx_maddubs_h(ax, sy);
+    const __m128i ones = __lsx_vreplgr2vr_h(1);
+    return lsx_madd_h(ones, dot);
+}
 #endif

 #if defined(__loongarch_asx)
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
    }
 }

-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = __lsx_vsigncov_b(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = __lsx_vsigncov_b(x, y);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = lsx_maddubs_h(ax, sy);
-    const __m128i ones = __lsx_vreplgr2vr_h(1);
-    return lsx_madd_h(ones, dot);
-}
-
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
    __m128 res = lasx_extractf128(x, 1);
@@ -260,6 +260,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }

+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const int qk = QK_MXFP4;
+    const int nb = n / qk;
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0  * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0.0f;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    const int8x16_t  v_k = vec_xl(0, kvalues_mxfp4);
+    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+
+    float32x4_t v_acc = vec_splats(0.0f);
+
+    #pragma GCC unroll 8
+    for (; ib + 1 < nb; ib += 2) {
+        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0  * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0  * GGML_RESTRICT y1 = &y[ib + 1];
+
+        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
+        const uint8x16_t v_x1 = vec_xl(0, x1->qs);
+
+        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+        v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
+        v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
+        v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
+        v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
+
+        const int8x16_t v_y0l = vec_xl(0,       y0->qs);
+        const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
+        const int8x16_t v_y1l = vec_xl(0,       y1->qs);
+        const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
+
+        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
+        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
+
+        const float32x4_t v_xy0f = vec_float(v_xy0);
+        const float32x4_t v_xy1f = vec_float(v_xy1);
+
+        const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+        v_acc = vec_madd(v_xy0f, v_d0, v_acc);
+        v_acc = vec_madd(v_xy1f, v_d1, v_acc);
+    }
+
+    for (; ib < nb; ++ib) {
+        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q8_0  * GGML_RESTRICT y0 = &y[ib + 0];
+
+        const uint8x16_t v_x = vec_xl(0, x0->qs);
+
+        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
+        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
+
+        const int8x16_t v_yl = vec_xl(0,       y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
+
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+        const float32x4_t v_xyf = vec_float(v_xy);
+
+        const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
+        v_acc = vec_madd(v_xyf, v_d, v_acc);
+    }
+
+    sumf = vec_hsum_f32x4(v_acc);
+    *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -473,10 +473,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
    ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
    int  last_graph;
    bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
    struct ggml_threadpool * threadpool;
    int ith;
 };
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(

    threadpool->workers = workers;

-#ifndef GGML_USE_OPENMP
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
    ggml_mutex_init(&threadpool->mutex);
    ggml_cond_init(&threadpool->cond);

@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
            }

-            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
        }
    } else {
        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32_EPR  4

 #define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    __lsx_vldi(0)
-#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y)   __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
+    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88);                                     \
    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

-    return __lsx_vld(tmp, 0);
+    return (__m128)__lsx_vld(tmp, 0);
 }

 static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 }

 #define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
+#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x)     (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA         GGML_F32x4_FMA
 #define GGML_F32Cx4_ADD         __lsx_vfadd_s
@@ -54,7 +54,7 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
    const uint32_t i3  = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);

-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
+    if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
        return;
    }

@@ -45,6 +45,7 @@
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
+#include "ggml-cuda/topk-moe.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv.cuh"
@@ -2825,6 +2826,44 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    GGML_ASSERT(unary_ops.size() == num_unary);
 #endif

+    //TODO: remove special case once ggml_can_fuse can handle empty nodes
+    std::initializer_list<enum ggml_op> topk_moe_ops           = ggml_cuda_topk_moe_ops(false);
+    std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
+
+    if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
+
+        if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
+            return false;
+        }
+
+        for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
+            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
+        }
+        ggml_tensor * softmax = cgraph->nodes[node_idx];
+        ggml_tensor * weights = cgraph->nodes[node_idx+8];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
+    if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
+
+        if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
+            return false;
+        }
+
+        for (size_t i = 0; i < topk_moe_ops.size(); i++) {
+            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
+        }
+
+        ggml_tensor * softmax = cgraph->nodes[node_idx];
+        ggml_tensor * weights = cgraph->nodes[node_idx+4];
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
    if (!ggml_can_fuse(cgraph, node_idx, ops)) {
        return false;
    }
@@ -2915,6 +2954,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
                if (!disable_fusion) {

+                    if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
+                        ggml_tensor * weights = cgraph->nodes[i+8];
+                        ggml_tensor * selected_experts = cgraph->nodes[i+3];
+                        ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
+                        i += 8;
+                        continue;
+                    }
+
+                    if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
+                        ggml_tensor * weights = cgraph->nodes[i+4];
+                        ggml_tensor * selected_experts = cgraph->nodes[i+3];
+                        ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
+                        i += 4;
+                        continue;
+                    }
+
                    if (node->op == GGML_OP_ADD) {
                        int n_fuse = 0;
                        ggml_op ops[8];
@@ -81,7 +81,7 @@ static __global__ void mmq_ids_helper(
 #pragma unroll
            for (int offset = neu_padded; offset < warp_size; offset += neu_padded) {
                const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size);
-                if (threadIdx.x >= offset) {
+                if (threadIdx.x >= static_cast<unsigned int>(offset)) {
                    it_compact_add_lower += tmp;
                }
            }
@@ -110,7 +110,7 @@ static __global__ void mmq_ids_helper(

    expert_bounds[expert] = nex_prev;

-    if (expert < gridDim.x - 1) {
+    if (expert < static_cast<int>(gridDim.x) - 1) {
        return;
    }

@@ -220,7 +220,7 @@ static __global__ void mul_mat_vec_q(
            tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
        }

-        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
            dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
        }
    }
@@ -51,6 +51,8 @@ static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
    }
    const float value               = *(const float *) (src0_ptr + src_idx * nb00);
    *(float *) (dst_ptr + i0 * nb0) = value;
+
+    GGML_UNUSED(p1);
 }

 void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -0,0 +1,259 @@
+#include "ggml-cuda/common.cuh"
+#include "ggml.h"
+#include "topk-moe.cuh"
+
+#include <initializer_list>
+
+/*
+    This kernel does the following:
+    1. softmax over the logits per token [n_experts, n_tokens]
+    2. argmax reduce over the top-k (n_experts_used) logits
+    3. write weights + ids to global memory
+    4. optionally normalize the weights
+
+    It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
+*/
+template <size_t n_experts, bool with_norm>
+__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
+                                                                  float *       weights,
+                                                                  int32_t *     ids,
+                                                                  const int     n_rows,
+                                                                  const int     n_expert_used) {
+    const int row = blockIdx.x * blockDim.y + threadIdx.y;
+    if (row >= n_rows) {
+        return;
+    }
+
+    logits += n_experts * row;
+    weights += n_expert_used * row;
+    ids += n_experts * row;
+
+    constexpr int experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
+
+    float logits_r[experts_per_thread];
+
+#pragma unroll
+    for (int i = 0; i < n_experts; i += WARP_SIZE) {
+        const int expert        = i + threadIdx.x;
+        logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[expert] : -INFINITY;
+    }
+
+    float max_val = logits_r[0];
+
+#pragma unroll
+    for (int i = 1; i < experts_per_thread; i++) {
+        const float val = logits_r[i];
+        max_val         = max(val, max_val);
+    }
+
+    max_val = warp_reduce_max(max_val);
+
+    float wt[experts_per_thread];
+    float tmp = 0.f;
+
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        const float val = logits_r[i];
+        wt[i]           = expf(val - max_val);
+        tmp += wt[i];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+
+    const float inv_sum = 1.0f / tmp;
+
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        wt[i] = wt[i] * inv_sum;
+    }
+
+    //at this point, each thread holds a portion of softmax,
+    //we do the argmax reduce over n_expert_used, each time marking
+    //the expert weight as -inf to exclude from the next iteration
+
+    float wt_sum = 0.f;
+
+    extern __shared__ float data_topk_shared[];
+    float *                 wt_shared_ptr = data_topk_shared + threadIdx.y * n_expert_used;
+
+    for (int k = 0; k < n_expert_used; k++) {
+        float max_val    = wt[0];
+        int   max_expert = threadIdx.x;
+
+#pragma unroll
+        for (int i = 1; i < experts_per_thread; i++) {
+            const int expert = threadIdx.x + i * WARP_SIZE;
+            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
+                max_val    = wt[i];
+                max_expert = expert;
+            }
+        }
+
+#pragma unroll
+        for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
+            const float val    = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
+            const int   expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
+            if (val > max_val || (val == max_val && expert < max_expert)) {
+                max_val    = val;
+                max_expert = expert;
+            }
+        }
+
+        if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
+            wt[max_expert / WARP_SIZE] = -INFINITY;
+
+            wt_shared_ptr[k] = max_val;
+            ids[k]           = max_expert;
+            if constexpr (with_norm) {
+                wt_sum += max_val;
+            }
+        }
+    }
+
+    if constexpr (with_norm) {
+        wt_sum              = warp_reduce_sum(wt_sum);
+        const float inv_sum = 1.0f / wt_sum;
+
+        for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
+            wt_shared_ptr[i] = wt_shared_ptr[i] * inv_sum;
+        }
+    }
+
+    for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
+        weights[i] = wt_shared_ptr[i];
+    }
+}
+
+template <bool with_norm>
+static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
+                                 const float *               logits,
+                                 float *                     weights,
+                                 int32_t *                   ids,
+                                 const int                   n_rows,
+                                 const int                   n_expert,
+                                 const int                   n_expert_used) {
+    const int    rows_per_block = 4;
+    dim3         grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
+    dim3         block_dims(WARP_SIZE, rows_per_block, 1);
+    cudaStream_t stream = ctx.stream();
+
+    const int nbytes_shared = n_expert_used * rows_per_block * sizeof(float);
+
+    switch (n_expert) {
+        case 1:
+            topk_moe_cuda<1, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 2:
+            topk_moe_cuda<2, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 4:
+            topk_moe_cuda<4, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 8:
+            topk_moe_cuda<8, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 16:
+            topk_moe_cuda<16, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 32:
+            topk_moe_cuda<32, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 64:
+            topk_moe_cuda<64, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 128:
+            topk_moe_cuda<128, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 256:
+            topk_moe_cuda<256, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        case 512:
+            topk_moe_cuda<512, with_norm>
+                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+            break;
+        default:
+            GGML_ASSERT(false && "fatal error");
+            break;
+    }
+}
+
+void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
+                           const ggml_tensor *         logits,
+                           ggml_tensor *               weights,
+                           ggml_tensor *               ids,
+                           const bool                  with_norm) {
+    GGML_ASSERT(logits->type == GGML_TYPE_F32);
+    GGML_ASSERT(weights->type == GGML_TYPE_F32);
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    const int n_experts = logits->ne[0];
+    const int n_rows    = logits->ne[1];
+
+    const float * logits_d  = (const float *) logits->src[0]->data;
+    float *       weights_d = (float *) weights->data;
+    int32_t *     ids_d     = (int32_t *) ids->data;
+
+    GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
+
+    cudaStream_t stream = ctx.stream();
+
+    const int n_expert_used = weights->ne[1];
+
+    if (with_norm) {
+        launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
+    } else {
+        launch_topk_moe_cuda<false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
+    }
+}
+
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights) {
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
+
+    if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
+        return false;
+    }
+
+    if (scale != 1.0f || max_bias != 0.0f) {
+        return false;
+    }
+
+    // don't fuse when masks or sinks are present
+    if (softmax->src[1] || softmax->src[2]) {
+        return false;
+    }
+
+    const int n_expert = softmax->ne[0];
+    // n_expert must be a power of 2
+    if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
+        return false;
+    }
+
+    return true;
+}
+
+std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm) {
+    static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE,  GGML_OP_ARGSORT,
+                                                            GGML_OP_VIEW,     GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
+                                                            GGML_OP_SUM_ROWS, GGML_OP_DIV,      GGML_OP_RESHAPE };
+
+    static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
+                                                               GGML_OP_VIEW, GGML_OP_GET_ROWS };
+
+    if (norm) {
+        return norm_ops;
+    }
+    return no_norm_ops;
+}
@@ -0,0 +1,14 @@
+#include "common.cuh"
+#include "ggml.h"
+
+#include <initializer_list>
+
+void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
+                           const ggml_tensor *         logits,
+                           ggml_tensor *               weights,
+                           ggml_tensor *               top_k,
+                           const bool                  with_norm);
+
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
+
+std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm);
@@ -342,6 +342,10 @@ struct ggml_cgraph {
 // if you need the gradients, get them from the original graph
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

+// ggml-alloc.c: true if the operation can reuse memory from its sources
+GGML_API bool ggml_op_can_inplace(enum ggml_op op);
+
+
 // Memory allocation

 GGML_API void * ggml_aligned_malloc(size_t size);
@@ -256,8 +256,6 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node

    // perform reorders only across these types of ops
    // can be expanded when needed
-    // IMPORTANT: do not add ops such as GGML_OP_CPY or GGML_OP_SET_ROWS
-    //            the dependencies from such ops are not always represented in the graph
    const auto & h_safe = [](ggml_op op) {
        switch (op) {
            case GGML_OP_MUL_MAT:
@@ -273,6 +271,8 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
            case GGML_OP_GLU:
            case GGML_OP_SCALE:
            case GGML_OP_GET_ROWS:
+            case GGML_OP_CPY:
+            case GGML_OP_SET_ROWS:
                return true;
            default:
                return ggml_op_is_empty(op);
@@ -383,6 +383,7 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
        // fuse only ops that start with these operations
        // can be expanded when needed
        if (node.op() == GGML_OP_ADD ||
+            node.op() == GGML_OP_NORM ||
            node.op() == GGML_OP_RMS_NORM) {
            ops[0] = node.op();

@@ -392,6 +393,7 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
                // can be expanded when needed
                if (gf->nodes[f]->op != GGML_OP_ADD &&
                    gf->nodes[f]->op != GGML_OP_MUL &&
+                    gf->nodes[f]->op != GGML_OP_NORM &&
                    gf->nodes[f]->op != GGML_OP_RMS_NORM) {
                    break;
                }
@@ -222,7 +222,28 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
        ctx->cmd_buf_last = nil;
    }

-    // release any completed command buffers
+    // check status of all command buffers
+    {
+        const int n_cb = ctx->n_cb;
+
+        for (int cb_idx = 0; cb_idx <= n_cb; ++cb_idx) {
+            id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
+            if (!cmd_buf) {
+                continue;
+            }
+
+            MTLCommandBufferStatus status = [cmd_buf status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, cb_idx, (int) status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
+                }
+                GGML_ABORT("fatal error");
+            }
+        }
+    }
+
+    // release any completed extra command buffers
    if (ctx->cmd_bufs_ext.count > 0) {
        for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) {
            id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_ext[i];
@@ -260,6 +281,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
                                                         length:size
                                                        options:MTLResourceStorageModeShared];

+        GGML_ASSERT(buf_src);
+
        struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
        if (bid_dst.metal == nil) {
            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
@@ -299,6 +322,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];

+        GGML_ASSERT(buf_dst);
+
        struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
        if (bid_src.metal == nil) {
            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
@@ -1090,36 +1090,6 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin(
    return res;
 }

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {
-    assert(op->op == GGML_OP_RMS_NORM);
-
-    GGML_ASSERT(op->src[0]->ne[0] % 4 == 0);
-    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
-
-    char base[256];
-    char name[256];
-
-    switch (n_fuse) {
-        case 1: snprintf(base, 256, "kernel_rms_norm_f32");         break;
-        case 2: snprintf(base, 256, "kernel_rms_norm_mul_f32");     break;
-        case 3: snprintf(base, 256, "kernel_rms_norm_mul_add_f32"); break;
-        default: GGML_ABORT("fatal error");
-    }
-
-    snprintf(name, 256, "%s", base);
-
-    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
-    if (res) {
-        return res;
-    }
-
-    res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
-
-    ggml_metal_pipeline_set_smem(res, 32*sizeof(float));
-
-    return res;
-}
-
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm(ggml_metal_library_t lib, const ggml_tensor * op) {
    assert(op->op == GGML_OP_L2_NORM);

@@ -1167,16 +1137,37 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm(ggml_metal_libr
    return res;
 }

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm(ggml_metal_library_t lib, const ggml_tensor * op) {
-    assert(op->op == GGML_OP_NORM);
+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm(ggml_metal_library_t lib, const ggml_tensor * op, int n_fuse) {
+    assert(op->op == GGML_OP_NORM || op->op == GGML_OP_RMS_NORM);

-    GGML_ASSERT(op->src[0]->ne[0] % 4 == 0);
-    GGML_ASSERT(ggml_is_contiguous_1(op->src[0]));
+    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));

    char base[256];
    char name[256];

-    snprintf(base, 256, "kernel_norm_f32");
+    const char * suffix = "";
+    if (op->ne[0] % 4 == 0) {
+        suffix = "_4";
+    }
+
+    switch (op->op) {
+        case GGML_OP_NORM:
+            switch (n_fuse) {
+                case 1: snprintf(base, 256, "kernel_norm_f32%s", suffix);         break;
+                case 2: snprintf(base, 256, "kernel_norm_mul_f32%s", suffix);     break;
+                case 3: snprintf(base, 256, "kernel_norm_mul_add_f32%s", suffix); break;
+                default: GGML_ABORT("fatal error");
+            } break;
+        case GGML_OP_RMS_NORM:
+            switch (n_fuse) {
+                case 1: snprintf(base, 256, "kernel_rms_norm_f32%s", suffix);         break;
+                case 2: snprintf(base, 256, "kernel_rms_norm_mul_f32%s", suffix);     break;
+                case 3: snprintf(base, 256, "kernel_rms_norm_mul_add_f32%s", suffix); break;
+                default: GGML_ABORT("fatal error");
+            } break;
+        default: GGML_ABORT("fatal error");
+    }
+
    snprintf(name, 256, "%s", base);

    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
@@ -1237,7 +1228,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col(ggml_metal_library_
    char base[256];
    char name[256];

-    snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
+    snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
    snprintf(name, 256, "%s", base);

    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
@@ -123,10 +123,9 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id         (ggml_me
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort           (ggml_metal_library_t lib, const struct ggml_tensor * op);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin               (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm          (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm           (ggml_metal_library_t lib, const struct ggml_tensor * op);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm        (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm              (ggml_metal_library_t lib, const struct ggml_tensor * op);
+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm              (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
@@ -661,13 +661,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
            return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
-        case GGML_OP_RMS_NORM:
        case GGML_OP_L2_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ARGMAX:
            return has_simdgroup_reduction;
        case GGML_OP_NORM:
-            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
+        case GGML_OP_RMS_NORM:
+            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_IM2COL:
@@ -1176,6 +1176,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];

+        GGML_ASSERT(buf_src);
+
        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;
@@ -1232,6 +1234,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];

+        GGML_ASSERT(buf_dst);
+
        id<MTLCommandQueue>  queue   = buf->queue;
        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];

@@ -428,16 +428,11 @@ typedef struct {
    uint64_t nb1;
 } ggml_metal_kargs_mul_mv_id;

+// NORM
+// RMS_NORM
 typedef struct {
    int32_t  ne00;
-    int32_t  ne00_4;
-    uint64_t nb01;
-    float    eps;
-} ggml_metal_kargs_norm;
-
-typedef struct {
-    int32_t  ne00;
-    int32_t  ne00_4;
+    int32_t  ne00_t;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
@@ -448,7 +443,7 @@ typedef struct {
    uint64_t nbf1[3];
    uint64_t nbf2[3];
    uint64_t nbf3[3];
-} ggml_metal_kargs_rms_norm;
+} ggml_metal_kargs_norm;

 typedef struct {
    int32_t  ne00;
@@ -266,10 +266,6 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
            {
                n_fuse = ggml_metal_op_set_rows(ctx, idx);
            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                n_fuse = ggml_metal_op_rms_norm(ctx, idx);
-            } break;
        case GGML_OP_L2_NORM:
            {
                n_fuse = ggml_metal_op_l2_norm(ctx, idx);
@@ -279,6 +275,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
                n_fuse = ggml_metal_op_group_norm(ctx, idx);
            } break;
        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
            {
                n_fuse = ggml_metal_op_norm(ctx, idx);
            } break;
@@ -2346,146 +2343,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
    return n_fuse;
 }

-int ggml_metal_op_rms_norm(ggml_metal_op_t ctx, int idx) {
-    ggml_cgraph * gf = ctx->gf;
-    ggml_tensor * op = ggml_graph_node(gf, idx);
-
-    ggml_metal_library_t lib = ctx->lib;
-    ggml_metal_encoder_t enc = ctx->enc;
-
-    const int idx_end = ctx->idx_end;
-
-    const bool use_fusion = ctx->use_fusion;
-
-    const int debug_fusion = ctx->debug_fusion;
-
-    ggml_tensor ** ops = ggml_graph_nodes(gf) + idx;
-
-    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
-    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
-    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
-
-    float eps;
-    memcpy(&eps, op->op_params, sizeof(float));
-
-    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
-    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);
-
-    ggml_metal_kargs_rms_norm args = {
-        /*.ne00   =*/ ne00,
-        /*.ne00_4 =*/ ne00/4,
-        /*.nb1    =*/ nb1,
-        /*.nb2    =*/ nb2,
-        /*.nb3    =*/ nb3,
-        /*.eps    =*/ eps,
-        /*.nef1   =*/ { ne01 },
-        /*.nef2   =*/ { ne02 },
-        /*.nef3   =*/ { ne03 },
-        /*.nbf1   =*/ { nb01 },
-        /*.nbf2   =*/ { nb02 },
-        /*.nbf3   =*/ { nb03 },
-    };
-
-    ggml_op fops[8];
-
-    int n_fuse = 1;
-
-    ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
-
-    // d[0] = rms_norm(a)
-    // d[1] = mul(d[0], b)
-    // d[2] = add(d[1], c)
-    if (use_fusion) {
-        fops[0] = GGML_OP_RMS_NORM;
-        fops[1] = GGML_OP_MUL;
-        fops[2] = GGML_OP_ADD;
-
-        for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
-            if (!ggml_can_fuse(gf, idx + n_fuse, fops + n_fuse, 2)) {
-                break;
-            }
-
-            if (ops[n_fuse] != ops[n_fuse + 1]->src[0]) {
-                break;
-            }
-
-            if (ops[n_fuse + 1]->src[1]->ne[0] != op->ne[0]) {
-                break;
-            }
-
-            if (!ggml_is_contiguous_rows(ops[n_fuse + 1]->src[1])) {
-                break;
-            }
-
-            if (ops[n_fuse + 1]->type != GGML_TYPE_F32) {
-                break;
-            }
-
-            //ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
-
-            bid_fuse[n_fuse] = ggml_metal_get_buffer_id(ops[n_fuse + 1]->src[1]);
-
-            args.nef1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[1];
-            args.nef2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[2];
-            args.nef3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[3];
-
-            args.nbf1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[1];
-            args.nbf2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[2];
-            args.nbf3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[3];
-        }
-
-        ++n_fuse;
-
-        if (debug_fusion > 1 && n_fuse > 1) {
-            if (n_fuse == 2) {
-                GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__);
-            }
-            if (n_fuse == 3) {
-                GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__);
-            }
-        }
-    }
-
-    if (n_fuse > 1) {
-        bid_dst = ggml_metal_get_buffer_id(ops[n_fuse - 1]);
-
-        for (int i = 1; i < n_fuse; ++i) {
-            if (!ggml_metal_op_concurrency_check(ctx, ops[i])) {
-                ggml_metal_op_concurrency_reset(ctx);
-
-                break;
-            }
-        }
-    }
-
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rms_norm(lib, op, n_fuse);
-
-    int nth = 32; // SIMD width
-
-    while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
-        nth *= 2;
-    }
-
-    nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
-    nth = std::min(nth, ne00/4);
-
-    const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
-
-    ggml_metal_encoder_set_pipeline(enc, pipeline);
-    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-    ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
-    ggml_metal_encoder_set_buffer  (enc, bid_fuse[0], 2);
-    ggml_metal_encoder_set_buffer  (enc, bid_fuse[1], 3);
-    ggml_metal_encoder_set_buffer  (enc, bid_dst, 4);
-
-    ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
-
-    ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
-
-    return n_fuse;
-}
-
 int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
    ggml_cgraph * gf = ctx->gf;
    ggml_tensor * op = ggml_graph_node(gf, idx);
@@ -2594,6 +2451,14 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
    ggml_metal_library_t lib = ctx->lib;
    ggml_metal_encoder_t enc = ctx->enc;

+    const int idx_end = ctx->idx_end;
+
+    const bool use_fusion = ctx->use_fusion;
+
+    const int debug_fusion = ctx->debug_fusion;
+
+    ggml_tensor ** ops = ggml_graph_nodes(gf) + idx;
+
    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
@@ -2602,37 +2467,121 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
    float eps;
    memcpy(&eps, op->op_params, sizeof(float));

+    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
+    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);
+
    ggml_metal_kargs_norm args = {
        /*.ne00   =*/ ne00,
-        /*.ne00_4 =*/ ne00/4,
-        /*.nb01   =*/ nb01,
+        /*.ne00_t =*/ ne00 % 4 == 0 ? ne00/4 : ne00,
+        /*.nb1    =*/ nb1,
+        /*.nb2    =*/ nb2,
+        /*.nb3    =*/ nb3,
        /*.eps    =*/ eps,
+        /*.nef1   =*/ { ne01 },
+        /*.nef2   =*/ { ne02 },
+        /*.nef3   =*/ { ne03 },
+        /*.nbf1   =*/ { nb01 },
+        /*.nbf2   =*/ { nb02 },
+        /*.nbf3   =*/ { nb03 },
    };

-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op);
+    ggml_op fops[8];
+
+    int n_fuse = 1;
+
+    ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
+
+    // d[0] = norm(a)
+    // d[1] = mul(d[0], b)
+    // d[2] = add(d[1], c)
+    if (use_fusion) {
+        fops[0] = op->op;
+        fops[1] = GGML_OP_MUL;
+        fops[2] = GGML_OP_ADD;
+
+        for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
+            if (!ggml_can_fuse(gf, idx + n_fuse, fops + n_fuse, 2)) {
+                break;
+            }
+
+            if (ops[n_fuse] != ops[n_fuse + 1]->src[0]) {
+                break;
+            }
+
+            if (ops[n_fuse + 1]->src[1]->ne[0] != op->ne[0]) {
+                break;
+            }
+
+            if (!ggml_is_contiguous_rows(ops[n_fuse + 1]->src[1])) {
+                break;
+            }
+
+            if (ops[n_fuse + 1]->type != GGML_TYPE_F32) {
+                break;
+            }
+
+            //ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
+
+            bid_fuse[n_fuse] = ggml_metal_get_buffer_id(ops[n_fuse + 1]->src[1]);
+
+            args.nef1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[1];
+            args.nef2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[2];
+            args.nef3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[3];
+
+            args.nbf1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[1];
+            args.nbf2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[2];
+            args.nbf3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[3];
+        }
+
+        ++n_fuse;
+
+        if (debug_fusion > 1 && n_fuse > 1) {
+            if (n_fuse == 2) {
+                GGML_LOG_DEBUG("%s: fuse: %s + MUL\n", __func__, ggml_op_name(op->op));
+            }
+            if (n_fuse == 3) {
+                GGML_LOG_DEBUG("%s: fuse: %s + MUL + ADD\n", __func__, ggml_op_name(op->op));
+            }
+        }
+    }
+
+    if (n_fuse > 1) {
+        bid_dst = ggml_metal_get_buffer_id(ops[n_fuse - 1]);
+
+        for (int i = 1; i < n_fuse; ++i) {
+            if (!ggml_metal_op_concurrency_check(ctx, ops[i])) {
+                ggml_metal_op_concurrency_reset(ctx);
+
+                break;
+            }
+        }
+    }
+
+    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);

    int nth = 32; // SIMD width
-    while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+
+    while (nth < args.ne00_t && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
        nth *= 2;
    }

    nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
-    nth = std::min(nth, ne00/4);
+    nth = std::min(nth, args.ne00_t);

    const size_t smem = ggml_metal_pipeline_get_smem(pipeline);

-    const int64_t nrows = ggml_nrows(op->src[0]);
-
    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+    ggml_metal_encoder_set_buffer  (enc, bid_src0,    1);
+    ggml_metal_encoder_set_buffer  (enc, bid_fuse[0], 2);
+    ggml_metal_encoder_set_buffer  (enc, bid_fuse[1], 3);
+    ggml_metal_encoder_set_buffer  (enc, bid_dst,     4);

    ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);

-    ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
+    ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);

-    return 1;
+    return n_fuse;
 }

 int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
@@ -2768,7 +2717,6 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
    const uint64_t ofs0 = op->src[1]->nb[is_2D ? 3 : 2] / 4;
    const uint64_t ofs1 = op->src[1]->nb[is_2D ? 2 : 1] / 4;

-
    ggml_metal_kargs_im2col args = {
        /*.ofs0 =*/ ofs0,
        /*.ofs1 =*/ ofs1,
@@ -2789,15 +2737,16 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {

    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);

-    const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N);
-    const int64_t  quotient  = N / n_threads + (N % n_threads > 0 ? 1 : 0);
+    GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+    const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);

-    ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1);
+    ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);

    return 1;
 }
@@ -60,7 +60,6 @@ int ggml_metal_op_mul_mat_id        (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_add_id            (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_flash_attn_ext    (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_bin               (ggml_metal_op_t ctx, int idx);
-int ggml_metal_op_rms_norm          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_l2_norm           (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_group_norm        (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_norm              (ggml_metal_op_t ctx, int idx);
@@ -66,6 +66,10 @@ static inline float e8m0_to_fp32(uint8_t x) {
    return as_type<float>(bits);
 }

+static inline float dot(float x, float y) {
+    return x*y;
+}
+
 // NOTE: this is not dequantizing - we are simply fitting the template
 template <typename type4x4>
 void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
@@ -2493,30 +2497,43 @@ kernel void kernel_argmax_f32(
    dst_i32[tgpig] = arg_val;
 }

-kernel void kernel_norm_f32(
+// F == 1 : norm (no fuse)
+// F == 2 : norm + mul
+// F == 3 : norm + mul + add
+template <typename T, short F>
+kernel void kernel_norm_fuse_impl(
        constant ggml_metal_kargs_norm & args,
        device const char * src0,
+        device const char * src1_0,
+        device const char * src1_1,
        device       char * dst,
        threadgroup float * shmem_f32 [[threadgroup(0)]],
-        uint   tgpig[[threadgroup_position_in_grid]],
-        ushort tpitg[[thread_position_in_threadgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]],
-        ushort tiisg[[thread_index_in_simdgroup]],
-        ushort   ntg[[threads_per_threadgroup]]) {
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
    if (sgitg == 0) {
        shmem_f32[tiisg] = 0.0f;
    }

-    device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
+    const int i01 = tgpig.x;
+    const int i02 = tgpig.y;
+    const int i03 = tgpig.z;

-    float4 sumf4(0.0f);
+    device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
+
+    device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
+    device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
+
+    T sumft(0.0f);

    float sumf = 0.0f;

-    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
-        sumf4 += x[i00];
+    for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
+        sumft += x[i00];
    }
-    sumf = sumf4[0] + sumf4[1] + sumf4[2] + sumf4[3];
+    sumf = dot(sumft, T(1.0f));
    sumf = simd_sum(sumf);

    threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -2532,10 +2549,10 @@ kernel void kernel_norm_f32(

    const float mean = sumf/args.ne00;

-    device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
+    device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);

    sumf = 0.0f;
-    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+    for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
        y[i00] = x[i00] - mean;
        sumf += dot(y[i00], y[i00]);
    }
@@ -2555,17 +2572,35 @@ kernel void kernel_norm_f32(
    const float variance = sumf/args.ne00;

    const float scale = 1.0f/sqrt(variance + args.eps);
-    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
-        y[i00] = y[i00] * scale;
+    for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
+        if (F == 1) {
+            y[i00] = (y[i00]*scale);
+        }
+        if (F == 2) {
+            y[i00] = (y[i00]*scale)*f0[i00];
+        }
+        if (F == 3) {
+            y[i00] = (y[i00]*scale)*f0[i00] + f1[i00];
+        }
    }
 }

+typedef decltype(kernel_norm_fuse_impl<float4, 1>) kernel_norm_fuse_t;
+
+template [[host_name("kernel_norm_f32")]]         kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 1>;
+template [[host_name("kernel_norm_mul_f32")]]     kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 2>;
+template [[host_name("kernel_norm_mul_add_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 3>;
+
+template [[host_name("kernel_norm_f32_4")]]         kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 1>;
+template [[host_name("kernel_norm_mul_f32_4")]]     kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 2>;
+template [[host_name("kernel_norm_mul_add_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 3>;
+
 // F == 1 : rms_norm (no fuse)
 // F == 2 : rms_norm + mul
 // F == 3 : rms_norm + mul + add
-template <short F>
+template <typename T, short F>
 kernel void kernel_rms_norm_fuse_impl(
-        constant ggml_metal_kargs_rms_norm & args,
+        constant ggml_metal_kargs_norm & args,
        device const char * src0,
        device const char * src1_0,
        device const char * src1_1,
@@ -2584,15 +2619,15 @@ kernel void kernel_rms_norm_fuse_impl(
    const int i02 = tgpig.y;
    const int i03 = tgpig.z;

-    device const float4 * x = (device const float4 *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
+    device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);

-    device const float4 * f0 = (device const float4 *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
-    device const float4 * f1 = (device const float4 *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
+    device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
+    device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);

    float sumf = 0.0f;

    // parallel sum
-    for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
+    for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
        sumf += dot(x[i00], x[i00]);
    }
    sumf = simd_sum(sumf);
@@ -2611,8 +2646,8 @@ kernel void kernel_rms_norm_fuse_impl(
    const float mean  = sumf/args.ne00;
    const float scale = 1.0f/sqrt(mean + args.eps);

-    device float4 * y = (device float4 *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
-    for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
+    device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
+    for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
        if (F == 1) {
            y[i00] = (x[i00]*scale);
        }
@@ -2625,11 +2660,15 @@ kernel void kernel_rms_norm_fuse_impl(
    }
 }

-typedef decltype(kernel_rms_norm_fuse_impl<1>) kernel_rms_norm_fuse_t;
+typedef decltype(kernel_rms_norm_fuse_impl<float4, 1>) kernel_rms_norm_fuse_t;

-template [[host_name("kernel_rms_norm_f32")]]         kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<1>;
-template [[host_name("kernel_rms_norm_mul_f32")]]     kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<2>;
-template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<3>;
+template [[host_name("kernel_rms_norm_f32")]]         kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 1>;
+template [[host_name("kernel_rms_norm_mul_f32")]]     kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 2>;
+template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 3>;
+
+template [[host_name("kernel_rms_norm_f32_4")]]         kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 1>;
+template [[host_name("kernel_rms_norm_mul_f32_4")]]     kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 2>;
+template [[host_name("kernel_rms_norm_mul_add_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 3>;

 kernel void kernel_l2_norm_f32(
        constant ggml_metal_kargs_l2_norm & args,
@@ -3987,60 +4026,7 @@ template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kerne
 template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
 template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;

-// TODO: obolete -- remove
-//typedef void (im2col_t)(
-//        constant ggml_metal_kargs_im2col & args,
-//        device const float * x,
-//        device        char * dst,
-//        uint3 tgpig[[threadgroup_position_in_grid]],
-//        uint3  tgpg[[threadgroups_per_grid]],
-//        uint3 tpitg[[thread_position_in_threadgroup]],
-//        uint3   ntg[[threads_per_threadgroup]]);
-//
-//template <typename T>
-//kernel void kernel_im2col(
-//        constant ggml_metal_kargs_im2col & args,
-//        device const float * x,
-//        device        char * dst,
-//        uint3 tgpig[[threadgroup_position_in_grid]],
-//        uint3  tgpg[[threadgroups_per_grid]],
-//        uint3 tpitg[[thread_position_in_threadgroup]],
-//        uint3   ntg[[threads_per_threadgroup]]) {
-////    const int64_t IC = tgpg[0];
-//    const int64_t OH = tgpg[1];
-//    const int64_t OW = tgpg[2];
-//
-////    const int64_t N  = ntg[0];
-//    const int64_t KH = ntg[1];
-//    const int64_t KW = ntg[2];
-//
-//    const int64_t in  = tpitg[0];
-//    const int64_t ikh = tpitg[1];
-//    const int64_t ikw = tpitg[2];
-//
-//    const int64_t iic = tgpig[0];
-//    const int64_t ioh = tgpig[1];
-//    const int64_t iow = tgpig[2];
-//
-//    const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
-//    const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;
-//
-//    const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);
-//
-//    device T * pdst = (device T *) (dst);
-//
-//    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
-//        pdst[offset_dst] = 0.0f;
-//    } else {
-//        const int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
-//        pdst[offset_dst] = x[offset_src];
-//    }
-//}
-//
-//template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
-//template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
-
-typedef void (im2col_ext_t)(
+typedef void (im2col_t)(
        constant ggml_metal_kargs_im2col & args,
        device const float * x,
        device        char * dst,
@@ -4050,48 +4036,113 @@ typedef void (im2col_ext_t)(
        uint3   ntg[[threads_per_threadgroup]]);

 template <typename T>
-kernel void kernel_im2col_ext(
+kernel void kernel_im2col(
        constant ggml_metal_kargs_im2col & args,
        device const float * x,
        device        char * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
+        uint3  tgpg[[threadgroups_per_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
-    const int64_t KHW = (int64_t)args.KHW;
+        uint3   ntg[[threads_per_threadgroup]]) {
+//    const int64_t IC = tgpg[0];
+    const int64_t OH = tgpg[1];
+    const int64_t OW = tgpg[2];

-    const int64_t d   = tgpig[0] / args.CHW;
-    const int64_t chw = tgpig[0] % args.CHW;
-    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
-    const int64_t HW = tgpig[0] % KHW;
+    const int64_t KH = ntg[1];
+    const int64_t KW = ntg[2];

-    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
-    if (tpitg_0 >= args.N) {
-        return;
-    }
+          int64_t in  = tpitg[0];
+    const int64_t ikh = tpitg[1];
+    const int64_t ikw = tpitg[2];

-    const int64_t tpitg_1 = HW / args.KW;
-    const int64_t tpitg_2 = HW % args.KW;
+    const int64_t iic = tgpig[0];
+    const int64_t ioh = tgpig[1];
+    const int64_t iow = tgpig[2];

-    const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
-    const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
+    const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
+    const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;

-    const int64_t offset_dst =
-        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
-        (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
+    int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);

    device T * pdst = (device T *) (dst);

    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
-        pdst[offset_dst] = 0.0f;
+        while (in < args.N) {
+            pdst[offset_dst] = 0.0f;
+            offset_dst += ntg[0]*args.CHW*OH*OW;
+
+            in += ntg[0];
+        }
    } else {
-        const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
-        pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
+        int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
+
+        while (in < args.N) {
+            pdst[offset_dst] = x[offset_src];
+
+            offset_dst += ntg[0]*args.CHW*OH*OW;
+            offset_src += ntg[0]*args.ofs0;
+
+            in += ntg[0];
+        }
    }
 }

-template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
-template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
+template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
+template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
+
+// TODO: obolete -- remove
+//typedef void (im2col_ext_t)(
+//        constant ggml_metal_kargs_im2col & args,
+//        device const float * x,
+//        device        char * dst,
+//        uint3 tgpig[[threadgroup_position_in_grid]],
+//        uint3  tgpg[[threadgroups_per_grid]],
+//        uint3 tpitg[[thread_position_in_threadgroup]],
+//        uint3   ntg[[threads_per_threadgroup]]);
+//
+//template <typename T>
+//kernel void kernel_im2col_ext(
+//        constant ggml_metal_kargs_im2col & args,
+//        device const float * x,
+//        device        char * dst,
+//        uint3 tgpig[[threadgroup_position_in_grid]],
+//        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
+//        uint3 tpitg[[thread_position_in_threadgroup]],
+//        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
+//    const int64_t KHW = (int64_t)args.KHW;
+//
+//    const int64_t d   = tgpig[0] / args.CHW;
+//    const int64_t chw = tgpig[0] % args.CHW;
+//    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
+//    const int64_t HW = tgpig[0] % KHW;
+//
+//    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
+//    if (tpitg_0 >= args.N) {
+//        return;
+//    }
+//
+//    const int64_t tpitg_1 = HW / args.KW;
+//    const int64_t tpitg_2 = HW % args.KW;
+//
+//    const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
+//    const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
+//
+//    const int64_t offset_dst =
+//        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
+//        (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
+//
+//    device T * pdst = (device T *) (dst);
+//
+//    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
+//        pdst[offset_dst] = 0.0f;
+//    } else {
+//        const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
+//        pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
+//    }
+//}
+//
+//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
+//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;

 typedef void (conv_transpose_1d_t)(
        constant ggml_metal_kargs_conv_transpose_1d & args,
@@ -3721,6 +3721,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT
            }
            float best = 0;
            float scale = max/(2*kMaxQ-1);
+            for (int k = 0; k < 8; ++k) is_on_grid[k] = true;
            for (int is = -15; is <= 15; ++is) {
                float id = (2*kMaxQ-1+is*0.2f)/max;
                float this_scale = 1/id;
@@ -31,6 +31,12 @@
 #include <filesystem>
 #include <algorithm>

+static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
+
+#define LOG_DBG(...) \
+    do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)
+
+
 namespace fs = std::filesystem;

 static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
@@ -47,7 +53,7 @@ struct socket_t {
    sockfd_t fd;
    socket_t(sockfd_t fd) : fd(fd) {}
    ~socket_t() {
-        GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
+        LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
 #ifdef _WIN32
        closesocket(this->fd);
 #else
@@ -265,14 +271,14 @@ static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
        return nullptr;
    }
    if (!set_no_delay(sockfd)) {
-        fprintf(stderr, "Failed to set TCP_NODELAY\n");
+        GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
        return nullptr;
    }
    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    struct hostent * server = gethostbyname(host);
    if (server == NULL) {
-        fprintf(stderr, "Cannot resolve host '%s'\n", host);
+        GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
        return nullptr;
    }
    memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
@@ -289,7 +295,7 @@ static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
        return nullptr;
    }
    if (!set_no_delay(client_socket_fd)) {
-        fprintf(stderr, "Failed to set TCP_NODELAY\n");
+        GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
        return nullptr;
    }
    return client_socket;
@@ -302,11 +308,11 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
        return nullptr;
    }
    if (!set_reuse_addr(sockfd)) {
-        fprintf(stderr, "Failed to set SO_REUSEADDR\n");
+        GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
        return nullptr;
    }
    if (inet_addr(host) == INADDR_NONE) {
-        fprintf(stderr, "Invalid host address: %s\n", host);
+        GGML_LOG_ERROR("Invalid host address: %s\n", host);
        return nullptr;
    }
    struct sockaddr_in serv_addr;
@@ -349,7 +355,7 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
            return false;
        }
        if (n == 0) {
-            GGML_LOG_ERROR("recv returned 0 (peer closed?)\n");
+            LOG_DBG("recv returned 0 (peer closed?)\n");
            return false;
        }
        bytes_recv += (size_t)n;
@@ -383,7 +389,7 @@ static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
    try {
        input.resize(size);
    } catch (const std::bad_alloc & e) {
-        fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size);
+        GGML_LOG_ERROR("Failed to allocate input buffer of size %" PRIu64 "\n", size);
        return false;
    }
    return recv_data(sockfd, input.data(), size);
@@ -443,11 +449,11 @@ static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
    bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
    RPC_STATUS_ASSERT(status);
    if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
-        fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
+        GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
        return false;
    }
    if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
-        fprintf(stderr, "WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
+        GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
    }
    return true;
 }
@@ -488,7 +494,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
    if (!check_server_version(sock)) {
        return nullptr;
    }
-    GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
+    LOG_DBG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
    sockets[endpoint] = sock;
    return sock;
 }
@@ -809,7 +815,7 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
    }
    auto sock = get_socket(endpoint);
    if (sock == nullptr) {
-        fprintf(stderr, "Failed to connect to %s\n", endpoint);
+        GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
        return nullptr;
    }
    size_t alignment = get_alignment(sock);
@@ -909,7 +915,7 @@ void rpc_server::hello(rpc_msg_hello_rsp & response) {
    response.major = RPC_PROTO_MAJOR_VERSION;
    response.minor = RPC_PROTO_MINOR_VERSION;
    response.patch = RPC_PROTO_PATCH_VERSION;
-    GGML_PRINT_DEBUG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
+    LOG_DBG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
 }

 bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
@@ -929,7 +935,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
        GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
        return false;
    }
-
+    LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
    if (tensor->buffer == nullptr) {
        //No buffer allocated.
        buft = ggml_backend_get_default_buffer_type(backend);
@@ -937,7 +943,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
        buft = tensor->buffer->buft;
    }

-    response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
+    response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);

    return true;
 }
@@ -950,29 +956,29 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
    if (buffer != nullptr) {
        response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
        response.remote_size = buffer->size;
-        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
+        LOG_DBG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
        buffers.insert(buffer);
    } else {
-        GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+        LOG_DBG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
    }
 }

 void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
    size_t alignment = ggml_backend_buft_get_alignment(buft);
-    GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
+    LOG_DBG("[%s] alignment: %lu\n", __func__, alignment);
    response.alignment = alignment;
 }

 void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
    size_t max_size = ggml_backend_buft_get_max_size(buft);
-    GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
+    LOG_DBG("[%s] max_size: %lu\n", __func__, max_size);
    response.max_size = max_size;
 }

 bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
-    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
+    LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
@@ -984,7 +990,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
 }

 bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
-    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
+    LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
@@ -996,7 +1002,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
 }

 bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
-    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
+    LOG_DBG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
@@ -1073,7 +1079,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
-    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+    LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);

    // sanitize tensor->data
    {
@@ -1096,7 +1102,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
        fs::path cache_file = fs::path(cache_dir) / hash_str;
        std::ofstream ofs(cache_file, std::ios::binary);
        ofs.write((const char *)data, size);
-        printf("[%s] saved to '%s'\n", __func__, cache_file.c_str());
+        GGML_LOG_INFO("[%s] saved to '%s'\n", __func__, cache_file.c_str());
    }
    ggml_backend_tensor_set(tensor, data, offset, size);
    return true;
@@ -1142,8 +1148,8 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
-    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
-        __func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
+    LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
+            __func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);

    // sanitize tensor->data
    {
@@ -1177,7 +1183,7 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
        GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
        return false;
    }
-
+    LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
    // Call the backend's buffer_init_tensor function
    ggml_backend_buffer_t buffer = tensor->buffer;
    if (buffer && buffer->iface.init_tensor) {
@@ -1210,7 +1216,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
-    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
+    LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);

    // sanitize tensor->data
    {
@@ -1254,7 +1260,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
    uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);

    if (dst_data + src_size > dst_base + dst_buf_sz) {
-        GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
+        GGML_LOG_ERROR("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
                         "    write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
                         "    buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
                         __func__,
@@ -1265,8 +1271,8 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
        return false;
    }

-    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
-                     __func__, (void*) src->buffer, (void*) dst->buffer);
+    LOG_DBG("[%s] src->buffer: %p, dst->buffer: %p\n",
+            __func__, (void*) src->buffer, (void*) dst->buffer);

    response.result = ggml_backend_buffer_copy_tensor(src, dst);
    return true;
@@ -1342,7 +1348,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
        return false;
    }
    const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
-    GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
+    LOG_DBG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);

    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);

@@ -1394,7 +1400,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
    }
    // the first command sent by the client must be HELLO
    if (cmd != RPC_CMD_HELLO) {
-        fprintf(stderr, "Expected HELLO command, update client\n");
+        GGML_LOG_ERROR("Expected HELLO command, update client\n");
        return;
    }
    if (!recv_msg(sockfd, nullptr, 0)) {
@@ -1411,7 +1417,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
        }
        if (cmd >= RPC_CMD_COUNT) {
            // fail fast if the command is invalid
-            fprintf(stderr, "Unknown command: %d\n", cmd);
+            GGML_LOG_ERROR("Unknown command: %d\n", cmd);
            break;
        }
        switch (cmd) {
@@ -1599,7 +1605,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                break;
            }
            default: {
-                fprintf(stderr, "Unknown command: %d\n", cmd);
+                GGML_LOG_ERROR("Unknown command: %d\n", cmd);
                return;
            }
        }
@@ -96,6 +96,7 @@ class Keys:
        FEED_FORWARD_LENGTH               = "{arch}.feed_forward_length"
        EXPERT_FEED_FORWARD_LENGTH        = "{arch}.expert_feed_forward_length"
        EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
+        EXPERT_CHUNK_FEED_FORWARD_LENGTH  = "{arch}.expert_chunk_feed_forward_length"
        USE_PARALLEL_RESIDUAL             = "{arch}.use_parallel_residual"
        TENSOR_DATA_LAYOUT                = "{arch}.tensor_data_layout"
        EXPERT_COUNT                      = "{arch}.expert_count"
@@ -104,6 +105,8 @@ class Keys:
        EXPERT_WEIGHTS_SCALE              = "{arch}.expert_weights_scale"
        EXPERT_WEIGHTS_NORM               = "{arch}.expert_weights_norm"
        EXPERT_GATING_FUNC                = "{arch}.expert_gating_func"
+        EXPERT_GROUP_SCALE                = "{arch}.expert_group_scale"
+        EXPERTS_PER_GROUP                 = "{arch}.experts_per_group"
        MOE_EVERY_N_LAYERS                = "{arch}.moe_every_n_layers"
        NEXTN_PREDICT_LAYERS              = "{arch}.nextn_predict_layers"
        POOLING_TYPE                      = "{arch}.pooling_type"
@@ -401,6 +404,7 @@ class MODEL_ARCH(IntEnum):
    LLADA            = auto()
    LLADA_MOE        = auto()
    SEED_OSS         = auto()
+    GROVEMOE         = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -450,6 +454,9 @@ class MODEL_TENSOR(IntEnum):
    FFN_GATE_SHEXP       = auto()
    FFN_DOWN_SHEXP       = auto()
    FFN_UP_SHEXP         = auto()
+    FFN_GATE_CHEXP       = auto()
+    FFN_DOWN_CHEXP       = auto()
+    FFN_UP_CHEXP         = auto()
    FFN_EXP_PROBS_B      = auto()
    ATTN_Q_NORM          = auto()
    ATTN_K_NORM          = auto()
@@ -738,6 +745,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLADA:            "llada",
    MODEL_ARCH.LLADA_MOE:        "llada-moe",
    MODEL_ARCH.SEED_OSS:         "seed_oss",
+    MODEL_ARCH.GROVEMOE:         "grovemoe",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -784,6 +792,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.FFN_GATE_SHEXP:            "blk.{bid}.ffn_gate_shexp",
    MODEL_TENSOR.FFN_DOWN_SHEXP:            "blk.{bid}.ffn_down_shexp",
    MODEL_TENSOR.FFN_UP_SHEXP:              "blk.{bid}.ffn_up_shexp",
+    MODEL_TENSOR.FFN_GATE_CHEXP:            "blk.{bid}.ffn_gate_chexps",
+    MODEL_TENSOR.FFN_DOWN_CHEXP:            "blk.{bid}.ffn_down_chexps",
+    MODEL_TENSOR.FFN_UP_CHEXP:              "blk.{bid}.ffn_up_chexps",
    MODEL_TENSOR.FFN_ACT:                   "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_NORM_EXP:              "blk.{bid}.ffn_norm_exps",
    MODEL_TENSOR.FFN_GATE_EXP:              "blk.{bid}.ffn_gate_exps",
@@ -2712,6 +2723,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
    ],
+    MODEL_ARCH.GROVEMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_CHEXP,
+        MODEL_TENSOR.FFN_DOWN_CHEXP,
+        MODEL_TENSOR.FFN_UP_CHEXP,
+    ],
    # TODO
 }

@@ -670,6 +670,9 @@ class GGUFWriter:
    def add_expert_shared_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

+    def add_expert_chunk_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_CHUNK_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

@@ -757,6 +760,12 @@ class GGUFWriter:
    def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
        self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)

+    def add_expert_group_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value)
+
+    def add_experts_per_group(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERTS_PER_GROUP.format(arch=self.arch), count)
+
    def add_moe_every_n_layers(self, value: int) -> None:
        self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)

@@ -427,6 +427,10 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_mlp.up_proj",             # hunyuan
        ),

+        MODEL_TENSOR.FFN_UP_CHEXP: (
+            "model.layers.{bid}.mlp.chunk_experts.up_proj",           # grovemoe
+        ),
+
        # AWQ-activation gate
        MODEL_TENSOR.FFN_ACT: (
            "transformer.blocks.{bid}.ffn.act",  # mpt
@@ -468,6 +472,10 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_mlp.gate_proj",             # hunyuan
        ),

+        MODEL_TENSOR.FFN_GATE_CHEXP: (
+            "model.layers.{bid}.mlp.chunk_experts.gate_proj",           # grovemoe
+        ),
+
        # Feed-forward down
        MODEL_TENSOR.FFN_DOWN: (
            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",                # gptneox
@@ -524,6 +532,10 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_mlp.down_proj",             # hunyuan
        ),

+        MODEL_TENSOR.FFN_DOWN_CHEXP: (
+            "model.layers.{bid}.mlp.chunk_experts.down_proj",           # grovemoe
+        ),
+
        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
            "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
@@ -1329,24 +1329,25 @@ extern "C" {
    //
    // Performance utils
    //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
    //

    struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens

-        int32_t n_p_eval;
-        int32_t n_eval;
-        int32_t n_reused; // number of times a ggml compute graph had been reused
+        int32_t n_p_eval;   // number of prompt tokens
+        int32_t n_eval;     // number of generated tokens
+        int32_t n_reused;   // number of times a ggml compute graph had been reused
    };

    struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms

-        int32_t n_sample;
+        int32_t n_sample;   // number of sampled tokens
    };

    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
@@ -1358,6 +1359,9 @@ extern "C" {
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);

+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
    //
    // training
    //
@@ -98,6 +98,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLADA,            "llada"            },
    { LLM_ARCH_LLADA_MOE,        "llada-moe"        },
    { LLM_ARCH_SEED_OSS,         "seed_oss"         },
+    { LLM_ARCH_GROVEMOE,         "grovemoe"         },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  "%s.expert_chunk_feed_forward_length"  },
    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
@@ -133,6 +135,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
    { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
    { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
+    { LLM_KV_EXPERT_GROUP_SCALE,                "%s.expert_group_scale"                },
+    { LLM_KV_EXPERTS_PER_GROUP,                 "%s.experts_per_group"                 },
    { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
    { LLM_KV_NEXTN_PREDICT_LAYERS,              "%s.nextn_predict_layers"              },
    { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
@@ -721,6 +725,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_CLS_OUT,         "cls.output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -2185,6 +2190,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_GROVEMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_CHEXPS,    "blk.%d.ffn_gate_chexps" },
+            { LLM_TENSOR_FFN_DOWN_CHEXPS,    "blk.%d.ffn_down_chexps" },
+            { LLM_TENSOR_FFN_UP_CHEXPS,      "blk.%d.ffn_up_chexps" },
+        },
+    },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -2317,6 +2345,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_DOWN_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_GATE_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_DOWN_CHEXPS,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_GATE_CHEXPS,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_UP_CHEXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
    // altup / laurel (gemma 3n)
    {LLM_TENSOR_PER_LAYER_TOKEN_EMBD,       {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
@@ -102,6 +102,7 @@ enum llm_arch {
    LLM_ARCH_LLADA,
    LLM_ARCH_LLADA_MOE,
    LLM_ARCH_SEED_OSS,
+    LLM_ARCH_GROVEMOE,
    LLM_ARCH_UNKNOWN,
 };

@@ -129,6 +130,7 @@ enum llm_kv {
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
@@ -137,6 +139,8 @@ enum llm_kv {
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_EXPERT_GROUP_SCALE,
+    LLM_KV_EXPERTS_PER_GROUP,
    LLM_KV_MOE_EVERY_N_LAYERS,
    LLM_KV_NEXTN_PREDICT_LAYERS,
    LLM_KV_POOLING_TYPE,
@@ -301,6 +305,9 @@ enum llm_tensor {
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_DOWN_CHEXPS,
+    LLM_TENSOR_FFN_GATE_CHEXPS,
+    LLM_TENSOR_FFN_UP_CHEXPS,
    LLM_TENSOR_FFN_EXP_PROBS_B,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
    n_reused    = 0;
 }

+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
    ctx->perf_reset();
 }

+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data              mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t          buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb   = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model   += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model   += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t          dev = devices[i];
+        llama_memory_breakdown_data mb  = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            "  - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            "  - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t          buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb   = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            "  - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;

+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;

+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
    // init scheduler and compute buffers, reserve worst-case graphs
    llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
    //
    // training
    //
@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
        std::vector<int> target_pos(n_seqs_unq, -1);
        std::vector<int> target_row(n_seqs_unq, -1);

-        bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
+        const bool last = (
+             cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+            (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+        );

        for (int i = 0; i < n_tokens; ++i) {
            const llama_pos pos = ubatch->pos[i];
@@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        selection_probs = logits;
    }

+    if (arch == LLM_ARCH_GROVEMOE) {
+        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
    // select experts
    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
    cb(selected_experts->src[0], "ffn_moe_argsort", il);
    cb(selected_experts, "ffn_moe_topk", il);

-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+        // TODO: Use scalar div instead when/if implemented
+        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+    } else {
+        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+    }
+
+    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
    cb(weights, "ffn_moe_weights", il);

+
    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
@@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        cb(weights, "ffn_moe_weights_scaled", il);
    }

+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

    if (weight_before_ffn) {
@@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }

 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);

    auto & cur = inp->cls;

@@ -1877,34 +1897,32 @@ void llm_graph_context::build_pooling(
        case LLAMA_POOLING_TYPE_RANK:
            {
                ggml_tensor * inp_cls = build_inp_cls();
-                inp = ggml_get_rows(ctx0, inp, inp_cls);
+                cur = ggml_get_rows(ctx0, inp, inp_cls);

+                // classification head
+                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                if (cls) {
-                    // classification head
-                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_mul_mat(ctx0, cls, inp);
+                    cur = ggml_mul_mat(ctx0, cls, cur);
                    if (cls_b) {
                        cur = ggml_add(ctx0, cur, cls_b);
                    }
                    cur = ggml_tanh(ctx0, cur);
+                }

-                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                    if (cls_out) {
-                        cur = ggml_mul_mat(ctx0, cls_out, cur);
-                        if (cls_out_b) {
-                            cur = ggml_add(ctx0, cur, cls_out_b);
-                        }
-                    }
-                } else if (cls_out) {
-                    // Single layer classification head (direct projection)
-                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    cur = ggml_mul_mat(ctx0, cls_out, inp);
+                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                // Single layer classification head (direct projection)
+                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+                if (cls_out) {
+                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                    if (cls_out_b) {
                        cur = ggml_add(ctx0, cur, cls_out_b);
                    }
-                } else {
-                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
+                }
+
+                // softmax for qwen3 reranker
+                if (arch == LLM_ARCH_QWEN3) {
+                    cur = ggml_soft_max(ctx0, cur);
                }
            } break;
        default:
@@ -206,7 +206,7 @@ public:

 class llm_graph_input_cls : public llm_graph_input_i {
 public:
-    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
    virtual ~llm_graph_input_cls() = default;

    void set_input(const llama_ubatch * ubatch) override;
@@ -214,6 +214,7 @@ public:
    ggml_tensor * cls; // I32 [n_batch]

    const llama_cparams cparams;
+    const llm_arch arch;
 };

 class llm_graph_input_rs : public llm_graph_input_i {
@@ -69,10 +69,13 @@ struct llama_hparams {
    uint32_t n_lora_kv          = 0;
    uint32_t n_ff_exp           = 0;
    uint32_t n_ff_shexp         = 0;
+    uint32_t n_ff_chexp         = 0;
    uint32_t n_expert_shared    = 0;
    uint32_t n_norm_groups      = 0;
+    uint32_t n_group_experts    = 0;

-    float    expert_weights_scale = 0.0;
+    float    expert_group_scale   = 0.05f;
+    float    expert_weights_scale = 0.0f;
    bool     expert_weights_norm  = false;
    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
    uint32_t moe_every_n_layers   = 0;
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
    return kv_swa->seq_pos_max(seq_id);
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
    GGML_UNUSED(embd_all);

@@ -56,6 +56,8 @@ public:
    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
    return cells.seq_pos_max(seq_id);
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
            llama_batch_allocr & balloc,
            uint32_t n_ubatch,
@@ -121,6 +121,8 @@ public:
    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
    return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    GGML_UNUSED(flags);

@@ -68,6 +68,8 @@ public:
    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
    return result;
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
    do {
        balloc.split_reset();
@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"

+#include <map>
 #include <set>
 #include <vector>

@@ -50,6 +51,8 @@ public:
    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
    bool prepare(const std::vector<llama_ubatch> & ubatches);

    // find a contiguous slot of memory cells and emplace the ubatch there
@@ -2,6 +2,7 @@

 #include "llama.h"

+#include <map>
 #include <memory>
 #include <functional>

@@ -108,6 +109,8 @@ struct llama_memory_i {
    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
    //
    // state write/read
    //
@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_1_7B:          return "1.7B";
        case LLM_TYPE_1_8B:          return "1.8B";
        case LLM_TYPE_2B:            return "2B";
+        case LLM_TYPE_2_6B:          return "2.6B";
        case LLM_TYPE_2_8B:          return "2.8B";
        case LLM_TYPE_2_9B:          return "2.9B";
        case LLM_TYPE_3B:            return "3B";
@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                }
-                switch (hparams.n_embd) {
-                    case 1024: type = LLM_TYPE_350M; break;
-                    case 1536: type = LLM_TYPE_700M; break;
-                    case 2048: type = LLM_TYPE_1_2B; break;
+                switch (hparams.n_ff()) {
+                    case  4608: type = LLM_TYPE_350M; break;
+                    case  6912: type = LLM_TYPE_700M; break;
+                    case  8192: type = LLM_TYPE_1_2B; break;
+                    case 10752: type = LLM_TYPE_2_6B; break;
                    default:   type = LLM_TYPE_UNKNOWN;
                }
            } break;
@@ -2007,6 +2009,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_GROVEMOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp);
+                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
+                ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
        default: throw std::runtime_error("unsupported model architecture");
    }

@@ -3165,6 +3180,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

+                    // output rerank head
+                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

@@ -5835,6 +5853,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                    }
                } break;
+            case LLM_ARCH_GROVEMOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        // MoE branch
+                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
+                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
+                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
+                    }
+                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -6003,6 +6068,14 @@ size_t llama_model::n_devices() const {
    return devices.size();
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 uint64_t llama_model::n_elements() const {
    return pimpl->n_elements;
 }
@@ -6166,6 +6239,13 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
    }

+    if (arch == LLM_ARCH_GROVEMOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_chexp           = %d\n",     __func__, hparams.n_ff_chexp);
+        LLAMA_LOG_INFO("%s: n_group_experts      = %d\n",     __func__, hparams.n_group_experts);
+        LLAMA_LOG_INFO("%s: expert_group_scale   = %.2f\n",   __func__, hparams.expert_group_scale);
+    }
+
    vocab.print_info();
 }

@@ -18851,6 +18931,156 @@ struct llm_build_smallthinker : public llm_graph_context{
    }
 };

+struct llm_build_grovemoe : public llm_graph_context {
+    llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+            cb(probs, "ffn_moe_logits", il);
+
+            ggml_tensor * moe_out =
+                build_moe_ffn(cur,
+                        nullptr,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il, probs);
+            cb(moe_out, "ffn_moe_out", il);
+            cur = moe_out;
+
+            // TODO: Only do the expert selection and weights once
+            moe_out =
+                build_moe_ffn(cur,
+                        nullptr,
+                        model.layers[il].ffn_up_chexps,
+                        model.layers[il].ffn_gate_chexps,
+                        model.layers[il].ffn_down_chexps,
+                        nullptr,
+                        n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il, probs);
+            cb(moe_out, "ffn_adj_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+            cb(cur, "ffn_final_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;

@@ -19377,6 +19607,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
+        case LLM_ARCH_GROVEMOE:
+            {
+                llm = std::make_unique<llm_build_grovemoe>(*this, params);
+            } break;
        default:
            GGML_ABORT("fatal error");
    }
@@ -19582,6 +19816,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
+        case LLM_ARCH_GROVEMOE:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"

+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -58,6 +59,7 @@ enum llm_type {
    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
    LLM_TYPE_2_8B,
    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
@@ -273,6 +275,11 @@ struct llama_layer {
    struct ggml_tensor * ffn_down_shexp     = nullptr;
    struct ggml_tensor * ffn_up_shexp       = nullptr;

+    // ff adjugate experts (chexps)
+    struct ggml_tensor * ffn_gate_chexps     = nullptr;
+    struct ggml_tensor * ffn_down_chexps     = nullptr;
+    struct ggml_tensor * ffn_up_chexps       = nullptr;
+
    // ff bias
    struct ggml_tensor * ffn_gate_b = nullptr;
    struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -452,10 +459,12 @@ struct llama_model {

    std::string desc() const;

-    size_t size() const;
+    size_t size() const; // file size
    size_t n_tensors() const;
    size_t n_devices() const;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
    // total number of parameters in the model
    uint64_t n_elements() const;

@@ -219,3 +219,6 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)
+
+llama_build_and_test(test-alloc.cpp)
+target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
@@ -0,0 +1,572 @@
+#include <ggml-alloc.h>
+#include <ggml-backend-impl.h>
+#include <ggml-cpp.h>
+#include <ggml-impl.h>
+#include <ggml.h>
+
+#include <algorithm>
+#include <exception>
+#include <memory>
+#include <vector>
+
+//
+// dummy backend with configurable max_buffer_size, tracks allocations
+
+uint8_t * const alloc_base = (uint8_t *) 16;
+
+struct dummy_backend_context {
+    size_t max_buffer_size = 64;
+    size_t alignment       = 8;
+
+    ggml_backend_buffer_i              buffer_interface;
+    std::vector<ggml_backend_buffer_t> buffers;
+
+    size_t allocated_total() const {
+        size_t n = 0;
+        for (ggml_backend_buffer_t buf : buffers) {
+            n += ggml_backend_buffer_get_size(buf);
+        }
+        return n;
+    }
+};
+
+// ggml_backend_buffer_type interface
+
+static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
+    return "dummy_buffer_type";
+}
+
+static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    dummy_backend_context * ctx    = (dummy_backend_context *) buft->context;
+    ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back();
+    buffer                         = ggml_backend_buffer_init(buft, ctx->buffer_interface, ctx, size);
+    return buffer;
+}
+
+static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->alignment;
+}
+
+static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->max_buffer_size;
+}
+
+static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
+    return true;
+}
+
+// ggml_backend_buffer interface
+
+static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;
+
+    auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer);
+    GGML_ASSERT(i != ctx->buffers.end());
+    ctx->buffers.erase(i);
+}
+
+static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
+    return alloc_base;
+}
+
+static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
+    return GGML_STATUS_SUCCESS;
+}
+
+static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}
+
+static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {}
+
+static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {}
+
+static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}
+
+// dummy_backend (not really a full backend, just provides what gallocr needs)
+
+struct dummy_backend {
+    std::unique_ptr<dummy_backend_context> context;
+    ggml_backend_buffer_type               buffer_type;
+};
+
+static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
+    dummy_backend b{};
+    b.context                  = std::make_unique<dummy_backend_context>();
+    b.context->alignment       = alignment;
+    b.context->max_buffer_size = max_buffer_size;
+
+    b.context->buffer_interface.free_buffer   = dummy_backend_buffer_free_buffer;
+    b.context->buffer_interface.get_base      = dummy_backend_buffer_get_base;
+    b.context->buffer_interface.init_tensor   = dummy_backend_buffer_init_tensor;
+    b.context->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor;
+    b.context->buffer_interface.set_tensor    = dummy_backend_buffer_set_tensor;
+    b.context->buffer_interface.get_tensor    = dummy_backend_buffer_get_tensor;
+    b.context->buffer_interface.clear         = dummy_backend_buffer_clear;
+
+    b.buffer_type.context             = b.context.get();
+    b.buffer_type.iface.get_name      = dummy_backend_buffer_type_get_name;
+    b.buffer_type.iface.alloc_buffer  = dummy_backend_buffer_type_alloc_buffer;
+    b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment;
+    b.buffer_type.iface.get_max_size  = dummy_backend_buffer_type_get_max_size;
+    b.buffer_type.iface.is_host       = dummy_backend_buffer_type_is_host;
+    return b;
+}
+
+//
+// test utilities
+
+struct test_context_with_graph {
+    ggml_context *   ctx;
+    ggml_cgraph *    graph;
+    ggml_context_ptr ctx_ptr;
+};
+
+static test_context_with_graph make_context() {
+    ggml_init_params params{};
+    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
+    params.no_alloc = true;
+
+    ggml_context *   ctx     = ggml_init(params);
+    ggml_context_ptr ctx_ptr = ggml_context_ptr(ctx);
+    ggml_cgraph *    graph   = ggml_new_graph(ctx);
+    return { ctx, graph, std::move(ctx_ptr) };
+}
+
+static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
+    ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+    ggml_set_input(t);
+    return t;
+}
+
+static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
+    GGML_ASSERT(size_bytes % 4 == 0);
+    return make_input_1d(ctx, size_bytes / 4);
+}
+
+static void assign_names(ggml_context * ctx, const char * prefix = "x") {
+    int i = 0;
+    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
+        ggml_format_name(t, "%s%d", prefix, i++);
+    }
+}
+
+static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
+    for (int i = 0; i < graph->n_leafs; ++i) {
+        if (strncmp(graph->leafs[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
+            return i;
+        }
+    }
+    fprintf(stderr, "leaf not found: %s\n", tensor_name);
+    return -1;
+}
+
+static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        if (strncmp(graph->nodes[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
+            return i;
+        }
+    }
+    fprintf(stderr, "node not found: %s", tensor_name);
+    return -1;
+}
+
+static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
+    ggml_set_output(out);
+    ggml_build_forward_expand(graph, out);
+
+    ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new(buft));
+    bool             result = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(result);
+    return galloc;
+}
+
+//
+// correctness checks for result allocations
+
+static void check_all_allocated(ggml_cgraph * graph) {
+    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
+        ggml_tensor * t = ggml_graph_node(graph, i);
+        GGML_ASSERT(t->buffer != nullptr);
+        GGML_ASSERT(t->data != nullptr);
+    }
+}
+
+static void check_max_size(ggml_context * ctx) {
+    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
+        auto   buft     = ggml_backend_buffer_get_type(t->buffer);
+        size_t max_size = ggml_backend_buft_get_max_size(buft);
+        size_t offset   = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer);
+        GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
+        GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);
+    }
+}
+
+static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
+    if (other->flags & GGML_TENSOR_FLAG_OUTPUT) {
+        return false;
+    }
+    // Check if `other` is still "alive", ie. an input to any node after the `current` op
+    for (int i = current_i; i < ggml_graph_n_nodes(graph); ++i) {
+        ggml_tensor * t = ggml_graph_node(graph, i);
+        for (int s = 0; s < GGML_MAX_SRC; s++) {
+            if (t == current && ggml_op_can_inplace(t->op)) {
+                continue;
+            }
+            if (t->src[s] == other) {
+                return false;
+            }
+            if (t->src[s] && t->src[s]->view_src == other) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
+    if (a->buffer != b->buffer) {
+        return false;
+    }
+    int64_t a0 = (int64_t) a->data;
+    int64_t a1 = a0 + ggml_nbytes(a);
+    int64_t b0 = (int64_t) b->data;
+    int64_t b1 = b0 + ggml_nbytes(b);
+    return a1 > b0 && b1 > a0;
+}
+
+static ggml_tensor * get_view_source(ggml_tensor * t) {
+    while (t->view_src) {
+        t = t->view_src;
+    }
+    return t;
+}
+
+static void check_no_overlap(ggml_cgraph * graph) {
+    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
+        for (int j = 0; j < i; ++j) {
+            ggml_tensor * t = ggml_graph_node(graph, i);
+            ggml_tensor * o = ggml_graph_node(graph, j);
+            GGML_ASSERT(t != o);
+
+            if (get_view_source(t) == get_view_source(o)) {
+                continue;
+            }
+            if (memory_overlap(t, o)) {
+                GGML_ASSERT(can_reuse_memory(graph, i, t, o));
+            }
+        }
+    }
+}
+
+//
+// test cases
+
+// Scenario where the first backend buffer is completely exhausted and there are further
+// tensors which require a second buffer
+static void test_max_size_too_many_tensors() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[7];
+    x[0] = make_input_with_size(ctx, 8);
+    x[1] = make_input_with_size(ctx, 8);
+    x[2] = make_input_with_size(ctx, 8);
+    x[3] = ggml_mul(ctx, x[0], x[1]);
+    x[4] = ggml_add(ctx, x[1], x[2]);
+    x[5] = ggml_add(ctx, x[3], x[0]);
+    x[6] = ggml_add(ctx, x[4], x[5]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
+}
+
+// Scenario where there is some space left in the first buffer, but not enough to accomodate
+// a larger tensor, so a second buffer is required
+static void test_max_size_tensor_too_large() {
+    dummy_backend backend      = dummy_backend_init(32);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 16);    // chunk 0, [0 , 16)
+    x[1] = make_input_with_size(ctx, 8);     // chunk 0, [16, 24)
+    x[2] = ggml_concat(ctx, x[0], x[1], 0);  // chunk 1, [0 , 24)
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
+}
+
+// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
+// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
+// Backends may report an artificially lower max size in some cases for compatibility reasons.
+static void test_tensor_larger_than_max_size() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() == 24);
+}
+
+// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
+// require more. Expectation is that the last buffer should grow to fit everything,
+// leaving it to the backend to error out if it can't allocate that much.
+static void test_not_enough_chunks() {
+    const int max_chunks = 16;
+    const int max_size   = 8;
+
+    dummy_backend backend      = dummy_backend_init(max_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[max_chunks + 1];
+    for (int i = 0; i < max_chunks + 1; ++i) {
+        x[i] = make_input_with_size(ctx, max_size);
+    }
+    ggml_tensor * acc = x[0];
+    for (int i = 0; i < max_chunks; ++i) {
+        acc = ggml_add(ctx, acc, x[i + 1]);
+    }
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
+}
+
+// Fill up leftover unallocated space of a chunk after allocating a large tensor that
+// requires a new chunk.
+static void test_fill_leftover_space() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[4];
+    x[0] = make_input_with_size(ctx, 8);
+    x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0);
+    x[3] = ggml_mean(ctx, x[1]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[3], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 12 + 16);
+}
+
+// Check that views don't require any extra memory
+static void test_view_inplace() {
+    dummy_backend backend      = dummy_backend_init(32);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[6];
+    x[0] = make_input_1d(ctx, 4);                // chunk 0, [0, 16)
+    x[1] = ggml_reshape_2d(ctx, x[0], 2, 2);     // view of x0
+    x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3);  // view of x0
+    x[3] = ggml_view_1d(ctx, x[2], 2, 4);        // view of x0
+    x[4] = make_input_1d(ctx, 2);                // chunk 0, [16, 24)
+    x[5] = ggml_add(ctx, x[3], x[4]);            // reuse (inplace add)
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 24);
+}
+
+static void test_reuse_and_free() {
+    dummy_backend backend      = dummy_backend_init(40);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = make_input_with_size(ctx, 8);
+    x[2] = make_input_with_size(ctx, 8);
+    x[3] = ggml_add(ctx, x[1], x[2]);        // reuse, free x2
+    x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0);  // alloc new buffer, free x0
+    x[5] = ggml_scale(ctx, x[4], 2.0f);      // alloc from free block
+    x[6] = ggml_add(ctx, x[4], x[5]);        // reuse, free x5
+    x[7] = ggml_view_1d(ctx, x[6], 2, 8);    // view
+    x[8] = ggml_add(ctx, x[3], x[7]);        // reuse
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
+}
+
+static void test_merge_free_block(size_t max_buffer_size) {
+    dummy_backend backend      = dummy_backend_init(max_buffer_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = make_input_with_size(ctx, 16);
+    x[2] = make_input_with_size(ctx, 16);
+    x[3] = ggml_mean(ctx, x[0]);
+    x[4] = ggml_mean(ctx, x[1]);
+    x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0);
+    x[6] = ggml_add(ctx, x[3], x[4]);
+    x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0);
+    x[8] = ggml_add(ctx, x[5], x[7]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
+}
+
+// Check that previously allocated but freed memory is preferred over allocating
+// additional memory, even if the remaining space in a chunk would match tensor size better
+static void test_prefer_already_allocated_memory() {
+    dummy_backend backend      = dummy_backend_init(32, /*align*/ 4);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 24);  // [24b][8b unused]
+    x[1] = ggml_mean(ctx, x[0]);           // [24b free][4b][4b unused]
+    x[2] = ggml_mean(ctx, x[1]);           // should be allocated in the 24b block
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() <= 28);
+}
+
+// test for allocating on multiple devices with some tensors in the graph
+// allocated externally (not by gallocr).
+static void test_multiple_buffer_types() {
+    dummy_backend backend_a = dummy_backend_init(32);
+    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
+
+    auto [ctx_a, _a, ctx_a_ptr] = make_context();
+    auto [ctx_b, _b, ctx_b_ptr] = make_context();
+    auto [ctx, graph, ctx_ptr]  = make_context();
+
+    ggml_tensor * a[2];
+    a[0] = make_input_with_size(ctx_a, 16);
+    a[1] = make_input_with_size(ctx_a, 16);
+    assign_names(ctx_a, "a");
+
+    ggml_tensor * b[2];
+    b[0] = make_input_with_size(ctx_b, 24);
+    b[1] = make_input_with_size(ctx_b, 4);
+    assign_names(ctx_b, "b");
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = ggml_mul(ctx, x[0], a[0]);
+    x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0);
+    x[3] = ggml_mul(ctx, x[2], b[0]);
+    x[4] = ggml_mean(ctx, x[3]);
+    x[5] = ggml_add(ctx, x[4], b[1]);
+    x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0);
+    x[7] = ggml_add(ctx, x[6], a[1]);
+    x[8] = ggml_scale(ctx, x[7], 2.0f);
+    assign_names(ctx, "x");
+
+    ggml_backend_buffer_ptr    buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type));
+    ggml_backend_buffer_ptr    buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type));
+    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
+
+    // assign buffer types manually to avoid extra complexity from backend scheduler
+    ggml_set_output(x[8]);
+    ggml_build_forward_expand(graph, x[8]);
+
+    GGML_ASSERT(graph->n_leafs == 5);
+    int leaf_buffer_ids[5];
+    leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0;
+    leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0;
+    leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1;
+    leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1;
+    leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0;
+
+    GGML_ASSERT(graph->n_nodes == 8);
+    int node_buffer_ids[8];
+    node_buffer_ids[get_node_id(graph, "x1")] = 0;
+    node_buffer_ids[get_node_id(graph, "x2")] = 0;
+    node_buffer_ids[get_node_id(graph, "x3")] = 1;
+    node_buffer_ids[get_node_id(graph, "x4")] = 1;
+    node_buffer_ids[get_node_id(graph, "x5")] = 1;
+    node_buffer_ids[get_node_id(graph, "x6")] = 1;
+    node_buffer_ids[get_node_id(graph, "x7")] = 0;
+    node_buffer_ids[get_node_id(graph, "x8")] = 0;
+
+    ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2));
+    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    ggml_gallocr_alloc_graph(galloc.get(), graph);
+
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24);
+    GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24);
+}
+
+static void test_buffer_size_zero() {
+    dummy_backend backend_a    = dummy_backend_init(SIZE_MAX);
+    dummy_backend backend_b    = dummy_backend_init(SIZE_MAX);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+
+    ggml_set_output(x[1]);
+    ggml_build_forward_expand(graph, x[1]);
+
+    int leaf_buffer_ids[1] = { 0 };
+    int node_buffer_ids[1] = { 0 };
+
+    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
+    ggml_gallocr_ptr           galloc   = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
+    bool                       res1     = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    bool                       res2     = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(res1 && res2);
+
+    check_all_allocated(graph);
+    GGML_ASSERT(backend_a.context->allocated_total() == 16);
+    GGML_ASSERT(backend_b.context->allocated_total() == 0);
+}
+
+static void run(const char * name, void (*f)()) {
+    printf("%s ", name);
+    fflush(stdout);
+    f();
+    printf("PASSED\n");
+}
+
+int main() {
+    run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
+    run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
+    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
+    run("test_not_enough_chunks", test_not_enough_chunks);
+    run("test_fill_leftover_space", test_fill_leftover_space);
+    run("test_view_inplace", test_view_inplace);
+    run("test_reuse_and_free", test_reuse_and_free);
+    run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
+    run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
+    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
+    run("test_multiple_buffer_types", test_multiple_buffer_types);
+    run("test_buffer_size_zero", test_buffer_size_zero);
+    return 0;
+}
@@ -4418,6 +4418,49 @@ struct test_argsort : public test_case {
    }
 };

+struct test_topk_moe: public test_case {
+    const std::array<int64_t, 4> ne;
+    const int n_expert_used;
+    const bool with_norm;
+    test_topk_moe(std::array<int64_t, 4> ne = {10, 5, 1, 1}, int n_expert_used = 1, bool with_norm = false)
+    : ne(ne), n_expert_used(n_expert_used), with_norm(with_norm) {
+        GGML_ASSERT(n_expert_used <= ne[0]);
+    }
+
+    std::string vars() override {
+        return VARS_TO_STR3(ne, n_expert_used, with_norm);
+    }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "TOPK_MOE";
+    }
+
+    bool run_whole_graph() override { return true; }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int n_expert = ne[0];
+        const int n_tokens = ne[1];
+
+        ggml_tensor * logits = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
+        ggml_tensor * probs  = ggml_soft_max(ctx, logits);
+        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+
+        ggml_tensor * out = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+
+        if (with_norm) {
+            out = ggml_reshape_2d(ctx, out, n_expert_used, n_tokens);
+            ggml_tensor * weights_sum = ggml_sum_rows(ctx, out); // [1, n_tokens]
+
+            out = ggml_div(ctx, out, weights_sum); // [n_expert_used, n_tokens]
+            out = ggml_reshape_3d(ctx, out, 1, n_expert_used, n_tokens);
+        }
+
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
 // GGML_OP_SUM
 struct test_sum : public test_case {
    const ggml_type type;
@@ -6117,7 +6160,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_l2_norm      (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
    }
    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
-        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
+        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
        test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
        test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
@@ -6588,6 +6631,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
    test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3}));

+    for (bool with_norm : {false, true}) {
+        test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm));
+        test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
+        test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm));
+    }
+
 #if 0
    // these tests are disabled to save execution time, sbut they can be handy for debugging
    test_cases.emplace_back(new test_llama(2, true));
@@ -260,14 +260,7 @@ int main(int argc, char * argv[]) {

    int64_t iterations = params.iterations;

-
-    // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    ggml_cpu_init();

    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        ggml_type type = (ggml_type) i;
@@ -359,7 +352,5 @@ int main(int argc, char * argv[]) {
        }
    }

-    ggml_free(ctx);
-
    return 0;
 }
@@ -3067,7 +3067,7 @@ struct image_manipulation {
        dst.buf.resize(3 * target_width * target_height);

        float Cc;
-        float C[5];
+        float C[5] = {};
        float d0, d2, d3, a0, a1, a2, a3;
        int i, j, k, jj;
        int x, y;
@@ -2060,6 +2060,7 @@ int main(int argc, char ** argv) {

    LOG("\n");
    llama_perf_context_print(ctx);
+    llama_memory_breakdown_print(ctx);

    llama_backend_free();

@@ -5093,21 +5093,15 @@ int main(int argc, char ** argv) {
            return;
        }

-        std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
-        if (tokenized_queries.size() != 1) {
-            res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
-        }
-
        // create and queue the task
        json responses = json::array();
        bool error = false;
        std::unordered_set<int> task_ids;
        {
            std::vector<server_task> tasks;
-            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
-            tasks.reserve(tokenized_docs.size());
-            for (size_t i = 0; i < tokenized_docs.size(); i++) {
-                auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
+            tasks.reserve(documents.size());
+            for (size_t i = 0; i < documents.size(); i++) {
+                auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]);
                server_task task   = server_task(SERVER_TASK_TYPE_RERANK);
                task.id            = ctx_server.queue_tasks.get_new_id();
                task.index         = i;
@@ -5268,42 +5262,6 @@ int main(int argc, char ** argv) {
    svr->Get (params.api_prefix + "/slots",               handle_slots);
    svr->Post(params.api_prefix + "/slots/:id_slot",      handle_slots_action);

-    // SPA fallback route - serve index.html for any route that doesn't match API endpoints
-    // This enables client-side routing for dynamic routes like /chat/[id]
-    if (params.webui && params.public_path.empty()) {
-        // Only add fallback when using embedded static files
-        svr->Get(".*", [](const httplib::Request & req, httplib::Response & res) {
-            // Skip API routes - they should have been handled above
-            if (req.path.find("/v1/") != std::string::npos ||
-                req.path.find("/health") != std::string::npos ||
-                req.path.find("/metrics") != std::string::npos ||
-                req.path.find("/props") != std::string::npos ||
-                req.path.find("/models") != std::string::npos ||
-                req.path.find("/api/tags") != std::string::npos ||
-                req.path.find("/completions") != std::string::npos ||
-                req.path.find("/chat/completions") != std::string::npos ||
-                req.path.find("/embeddings") != std::string::npos ||
-                req.path.find("/tokenize") != std::string::npos ||
-                req.path.find("/detokenize") != std::string::npos ||
-                req.path.find("/lora-adapters") != std::string::npos ||
-                req.path.find("/slots") != std::string::npos) {
-                return false; // Let other handlers process API routes
-            }
-
-            // Serve index.html for all other routes (SPA fallback)
-            if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
-                res.set_content("Error: gzip is not supported by this browser", "text/plain");
-            } else {
-                res.set_header("Content-Encoding", "gzip");
-                // COEP and COOP headers, required by pyodide (python interpreter)
-                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
-                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
-                res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
-            }
-            return false;
-        });
-    }
-
    //
    // Start the server
    //
@@ -64,3 +64,33 @@ cmake --build build -j --target llama-server && ./tools/server/tests/tests.sh
 ```

 To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
+
+### Debugging external llama-server
+It can sometimes be useful to run the server in a debugger when invesigating test
+failures. To do this, the environment variable `DEBUG_EXTERNAL=1` can be set
+which will cause the test to skip starting a llama-server itself. Instead, the
+server can be started in a debugger.
+
+Example using `gdb`:
+```console
+$ gdb --args ../../../build/bin/llama-server \
+    --host 127.0.0.1 --port 8080 \
+    --temp 0.8 --seed 42 \
+    --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf \
+    --batch-size 32 --no-slots --alias tinyllama-2 --ctx-size 512 \
+    --parallel 2 --n-predict 64
+```
+And a break point can be set in before running:
+```console
+(gdb) br server.cpp:4604
+(gdb) r
+main: server is listening on http://127.0.0.1:8080 - starting the main loop
+srv  update_slots: all slots are idle
+```
+
+And then the test in question can be run in another terminal:
+```console
+(venv) $ env DEBUG_EXTERNAL=1 ./tests.sh unit/test_chat_completion.py -v -x
+```
+And this should trigger the breakpoint and allow inspection of the server state
+in the debugger terminal.
@@ -99,8 +99,12 @@ class ServerProcess:
            self.debug = True
        if "PORT" in os.environ:
            self.server_port = int(os.environ["PORT"])
+        self.external_server = "DEBUG_EXTERNAL" in os.environ

    def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
+        if self.external_server:
+            print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
+            return
        if self.server_path is not None:
            server_path = self.server_path
        elif "LLAMA_SERVER_BIN_PATH" in os.environ:
@@ -244,6 +248,9 @@ class ServerProcess:
        raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")

    def stop(self) -> None:
+        if self.external_server:
+            print("[external_server]: Not stopping external server")
+            return
        if self in server_instances:
            server_instances.remove(self)
        if self.process:
@@ -1368,34 +1368,6 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
    return std::to_string(hash);
 }

-
-// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
-static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) {
-    server_tokens result = {};
-
-    // Get EOS token - use SEP token as fallback if EOS is not available
-    llama_token eos_token = llama_vocab_eos(vocab);
-    if (eos_token == LLAMA_TOKEN_NULL) {
-        eos_token = llama_vocab_sep(vocab);
-    }
-    if (llama_vocab_get_add_bos(vocab)) {
-        result.push_back(llama_vocab_bos(vocab));
-    }
-    result.push_back(query);
-    if (llama_vocab_get_add_eos(vocab)) {
-        result.push_back(eos_token);
-    }
-    if (llama_vocab_get_add_sep(vocab)) {
-        result.push_back(llama_vocab_sep(vocab));
-    }
-    result.push_back(doc);
-    if (llama_vocab_get_add_eos(vocab)) {
-        result.push_back(eos_token);
-    }
-    return result;
-}
-
-
 static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
    mtmd::bitmaps bitmaps;
    for (auto & file : files) {
@@ -1501,3 +1473,43 @@ static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * voc
    }
    return result;
 }
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
+static server_tokens format_rerank(const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, const std::string & query, const std::string & doc) {
+    server_tokens result = {};
+
+    const char * rerank_prompt = llama_model_chat_template(model, "rerank");
+
+    if (rerank_prompt != nullptr) {
+        std::string prompt = rerank_prompt;
+        string_replace_all(prompt, "{query}"   , query);
+        string_replace_all(prompt, "{document}", doc  );
+        server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true);
+        result.push_back(tokens);
+    } else {
+        // Get EOS token - use SEP token as fallback if EOS is not available
+        server_tokens query_tokens = tokenize_input_subprompt(vocab, mctx, query, false, false);
+        server_tokens doc_tokens   = tokenize_input_subprompt(vocab, mctx, doc,   false, false);
+        llama_token eos_token = llama_vocab_eos(vocab);
+        if (eos_token == LLAMA_TOKEN_NULL) {
+            eos_token = llama_vocab_sep(vocab);
+        }
+
+        if (llama_vocab_get_add_bos(vocab)) {
+            result.push_back(llama_vocab_bos(vocab));
+        }
+        result.push_back(query_tokens);
+        if (llama_vocab_get_add_eos(vocab)) {
+            result.push_back(eos_token);
+        }
+        if (llama_vocab_get_add_sep(vocab)) {
+            result.push_back(llama_vocab_sep(vocab));
+        }
+        result.push_back(doc_tokens);
+        if (llama_vocab_get_add_eos(vocab)) {
+            result.push_back(eos_token);
+        }
+    }
+
+    return result;
+}
@@ -4,7 +4,7 @@
 	"version": "1.0.0",
 	"type": "module",
 	"scripts": {
-		"dev": "vite dev --host 0.0.0.0 & storybook dev -p 6006 --ci",
+		"dev": "bash scripts/dev.sh",
 		"build": "vite build && ./scripts/post-build.sh",
 		"preview": "vite preview",
 		"prepare": "svelte-kit sync || echo ''",
@@ -20,7 +20,8 @@
 		"test:ui": "vitest --project=ui",
 		"test:unit": "vitest",
 		"storybook": "storybook dev -p 6006",
-		"build-storybook": "storybook build"
+		"build-storybook": "storybook build",
+		"cleanup": "rm -rf .svelte-kit build node_modules test-results"
 	},
 	"devDependencies": {
 		"@chromatic-com/storybook": "^4.0.1",
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+cd ../../../
+
+# Check and install git hooks if missing
+check_and_install_hooks() {
+    local hooks_missing=false
+    
+    # Check for required hooks
+    if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ] || [ ! -f ".git/hooks/post-push" ]; then
+        hooks_missing=true
+    fi
+    
+    if [ "$hooks_missing" = true ]; then
+        echo "🔧 Git hooks missing, installing them..."
+        cd tools/server/webui
+        if bash scripts/install-git-hooks.sh; then
+            echo "✅ Git hooks installed successfully"
+        else
+            echo "⚠️  Failed to install git hooks, continuing anyway..."
+        fi
+        cd ../../../
+    else
+        echo "✅ Git hooks already installed"
+    fi
+}
+
+# Install git hooks if needed
+check_and_install_hooks
+
+# Check if llama-server binary already exists
+if [ ! -f "build/bin/llama-server" ]; then
+    echo "Building llama-server..."
+    cmake -B build && cmake --build build --config Release -t llama-server
+else
+    echo "llama-server binary already exists, skipping build."
+fi
+
+# Start llama-server and capture output
+echo "Starting llama-server..."
+mkfifo server_output.pipe
+build/bin/llama-server -hf ggml-org/gpt-oss-20b-GGUF --jinja -c 0 --no-webui > server_output.pipe 2>&1 &
+SERVER_PID=$!
+
+# Function to wait for server to be ready
+wait_for_server() {
+    echo "Waiting for llama-server to be ready..."
+    local max_wait=60
+    local start_time=$(date +%s)
+    
+    # Read server output in background and look for the ready message
+    (
+        while IFS= read -r line; do
+            echo "🔍 Server: $line"
+            if [[ "$line" == *"server is listening on http://127.0.0.1:8080 - starting the main loop"* ]]; then
+                echo "✅ llama-server is ready!"
+                echo "READY" > server_ready.flag
+                break
+            fi
+        done < server_output.pipe
+    ) &
+    
+    # Wait for ready flag or timeout
+    while [ ! -f server_ready.flag ]; do
+        local current_time=$(date +%s)
+        local elapsed=$((current_time - start_time))
+        
+        if [ $elapsed -ge $max_wait ]; then
+            echo "❌ Server failed to start within $max_wait seconds"
+            rm -f server_ready.flag
+            return 1
+        fi
+        
+        sleep 1
+    done
+    
+    rm -f server_ready.flag
+    return 0
+}
+
+# Cleanup function
+cleanup() {
+    echo "🧹 Cleaning up..."
+    kill $SERVER_PID 2>/dev/null
+    rm -f server_output.pipe server_ready.flag
+    exit
+}
+
+# Set up signal handlers
+trap cleanup SIGINT SIGTERM
+
+# Wait for server to be ready
+if wait_for_server; then
+    echo "🚀 Starting development servers..."
+    cd tools/server/webui
+    storybook dev -p 6006 --ci & vite dev --host 0.0.0.0 &
+    
+    # Wait for all background processes
+    wait
+else
+    echo "❌ Failed to start development environment"
+    cleanup
+fi
@@ -1,14 +1,14 @@
 #!/bin/bash

-# Script to install pre-commit and post-commit hooks for webui
-# Pre-commit: formats, lints, checks, and builds code, stashes unstaged changes
-# Post-commit: automatically unstashes changes
+# Script to install pre-commit and pre-push hooks for webui
+# Pre-commit: formats code and runs checks
+# Pre-push: builds the project, stashes unstaged changes

 REPO_ROOT=$(git rev-parse --show-toplevel)
 PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit"
-POST_COMMIT_HOOK="$REPO_ROOT/.git/hooks/post-commit"
+PRE_PUSH_HOOK="$REPO_ROOT/.git/hooks/pre-push"

-echo "Installing pre-commit and post-commit hooks for webui..."
+echo "Installing pre-commit and pre-push hooks for webui..."

 # Create the pre-commit hook
 cat > "$PRE_COMMIT_HOOK" << 'EOF'
@@ -16,7 +16,7 @@ cat > "$PRE_COMMIT_HOOK" << 'EOF'

 # Check if there are any changes in the webui directory
 if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
-    echo "Formatting webui code..."
+    echo "Formatting and checking webui code..."
    
    # Change to webui directory and run format
    cd tools/server/webui
@@ -27,20 +27,12 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
        exit 1
    fi
    
-    # Stash any unstaged changes to avoid conflicts during format/build
-    echo "Stashing unstaged changes..."
-    git stash push --keep-index --include-untracked -m "Pre-commit hook: stashed unstaged changes"
-    STASH_CREATED=$?
-    
    # Run the format command
    npm run format

    # Check if format command succeeded
    if [ $? -ne 0 ]; then
        echo "Error: npm run format failed"
-        if [ $STASH_CREATED -eq 0 ]; then
-            echo "You can restore your unstaged changes with: git stash pop"
-        fi
        exit 1
    fi

@@ -50,9 +42,6 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
    # Check if lint command succeeded
    if [ $? -ne 0 ]; then
        echo "Error: npm run lint failed"
-        if [ $STASH_CREATED -eq 0 ]; then
-            echo "You can restore your unstaged changes with: git stash pop"
-        fi
        exit 1
    fi

@@ -62,73 +51,151 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
    # Check if check command succeeded
    if [ $? -ne 0 ]; then
        echo "Error: npm run check failed"
-        if [ $STASH_CREATED -eq 0 ]; then
-            echo "You can restore your unstaged changes with: git stash pop"
-        fi
        exit 1
    fi

-    # Run the build command
-    npm run build
-    
-    # Check if build command succeeded
-    if [ $? -ne 0 ]; then
-        echo "Error: npm run build failed"
-        if [ $STASH_CREATED -eq 0 ]; then
-            echo "You can restore your unstaged changes with: git stash pop"
-        fi
-        exit 1
-    fi
-
-    # Go back to repo root to add build output
+    # Go back to repo root
    cd ../../..
    
-    # Add the build output to staging area
-    git add tools/server/public/index.html.gz
-    
-    if [ $STASH_CREATED -eq 0 ]; then
-        echo "✅ Build completed. Your unstaged changes have been stashed."
-        echo "They will be automatically restored after the commit."
-        # Create a marker file to indicate stash was created by pre-commit hook
-        touch .git/WEBUI_STASH_MARKER
-    fi
-    
-    echo "Webui code formatted successfully"
+    echo "✅ Webui code formatted and checked successfully"
 fi

 exit 0
 EOF

-# Create the post-commit hook
-cat > "$POST_COMMIT_HOOK" << 'EOF'
+# Create the pre-push hook
+cat > "$PRE_PUSH_HOOK" << 'EOF'
 #!/bin/bash

-# Check if we have a stash marker from the pre-commit hook
-if [ -f .git/WEBUI_STASH_MARKER ]; then
-    echo "Restoring your unstaged changes..."
+# Check if there are any webui changes that need building
+WEBUI_CHANGES=$(git diff --name-only @{push}..HEAD | grep "^tools/server/webui/" || true)
+
+if [ -n "$WEBUI_CHANGES" ]; then
+    echo "Webui changes detected, checking if build is up-to-date..."
+    
+    # Change to webui directory
+    cd tools/server/webui
+    
+    # Check if npm is available and package.json exists
+    if [ ! -f "package.json" ]; then
+        echo "Error: package.json not found in tools/server/webui"
+        exit 1
+    fi
+    
+    # Check if build output exists and is newer than source files
+    BUILD_FILE="../public/index.html.gz"
+    NEEDS_BUILD=false
+    
+    if [ ! -f "$BUILD_FILE" ]; then
+        echo "Build output not found, building..."
+        NEEDS_BUILD=true
+    else
+        # Check if any source files are newer than the build output
+        if find src -newer "$BUILD_FILE" -type f | head -1 | grep -q .; then
+            echo "Source files are newer than build output, rebuilding..."
+            NEEDS_BUILD=true
+        fi
+    fi
+    
+    if [ "$NEEDS_BUILD" = true ]; then
+        echo "Building webui..."
+        
+        # Stash any unstaged changes to avoid conflicts during build
+        echo "Checking for unstaged changes..."
+        if ! git diff --quiet || ! git diff --cached --quiet --diff-filter=A; then
+            echo "Stashing unstaged changes..."
+            git stash push --include-untracked -m "Pre-push hook: stashed unstaged changes"
+            STASH_CREATED=$?
+        else
+            echo "No unstaged changes to stash"
+            STASH_CREATED=1
+        fi
+        
+        # Run the build command
+        npm run build
+        
+        # Check if build command succeeded
+        if [ $? -ne 0 ]; then
+            echo "Error: npm run build failed"
+            if [ $STASH_CREATED -eq 0 ]; then
+                echo "You can restore your unstaged changes with: git stash pop"
+            fi
+            exit 1
+        fi
+
+        # Go back to repo root
+        cd ../../..
+        
+        # Check if build output was created/updated
+        if [ -f "tools/server/public/index.html.gz" ]; then
+            # Add the build output and commit it
+            git add tools/server/public/index.html.gz
+            if ! git diff --cached --quiet; then
+                echo "Committing updated build output..."
+                git commit -m "chore: update webui build output"
+                echo "✅ Build output committed successfully"
+            else
+                echo "Build output unchanged"
+            fi
+        else
+            echo "Error: Build output not found after build"
+            if [ $STASH_CREATED -eq 0 ]; then
+                echo "You can restore your unstaged changes with: git stash pop"
+            fi
+            exit 1
+        fi
+        
+        if [ $STASH_CREATED -eq 0 ]; then
+            echo "✅ Build completed. Your unstaged changes have been stashed."
+            echo "They will be automatically restored after the push."
+            # Create a marker file to indicate stash was created by pre-push hook
+            touch .git/WEBUI_PUSH_STASH_MARKER
+        fi
+    else
+        echo "✅ Build output is up-to-date"
+    fi
+    
+    echo "✅ Webui ready for push"
+fi
+
+exit 0
+EOF
+
+# Create the post-push hook (for restoring stashed changes after push)
+cat > "$REPO_ROOT/.git/hooks/post-push" << 'EOF'
+#!/bin/bash
+
+# Check if we have a stash marker from the pre-push hook
+if [ -f .git/WEBUI_PUSH_STASH_MARKER ]; then
+    echo "Restoring your unstaged changes after push..."
    git stash pop
-    rm -f .git/WEBUI_STASH_MARKER
+    rm -f .git/WEBUI_PUSH_STASH_MARKER
    echo "✅ Your unstaged changes have been restored."
 fi

 exit 0
 EOF

-# Make both hooks executable
+# Make all hooks executable
 chmod +x "$PRE_COMMIT_HOOK"
-chmod +x "$POST_COMMIT_HOOK"
+chmod +x "$PRE_PUSH_HOOK"
+chmod +x "$REPO_ROOT/.git/hooks/post-push"

 if [ $? -eq 0 ]; then
-    echo "✅ Pre-commit and post-commit hooks installed successfully!"
-    echo "   Pre-commit:  $PRE_COMMIT_HOOK"
-    echo "   Post-commit: $POST_COMMIT_HOOK"
+    echo "✅ Git hooks installed successfully!"
+    echo "   Pre-commit: $PRE_COMMIT_HOOK"
+    echo "   Pre-push:   $PRE_PUSH_HOOK"
+    echo "   Post-push:  $REPO_ROOT/.git/hooks/post-push"
    echo ""
    echo "The hooks will automatically:"
-    echo "  • Format, lint, check, and build webui code before commits"
-    echo "  • Stash unstaged changes during the process"
-    echo "  • Restore your unstaged changes after the commit"
+    echo "  • Format and check webui code before commits (pre-commit)"
+    echo "  • Build webui code before pushes (pre-push)"
+    echo "  • Stash unstaged changes during build process"
+    echo "  • Restore your unstaged changes after the push"
    echo ""
-    echo "To test the hooks, make a change to a file in the webui directory and commit it."
+    echo "To test the hooks:"
+    echo "  • Make a change to a file in the webui directory and commit it (triggers format/check)"
+    echo "  • Push your commits to trigger the build process"
 else
    echo "❌ Failed to make hooks executable"
    exit 1
@@ -1,3 +1,3 @@
 rm -rf ../public/_app;
 rm ../public/favicon.svg;
-rm ../public/index.html;
+rm ../public/index.html;
@@ -50,7 +50,7 @@

 <div class="relative {justify === 'start' ? 'mt-2' : ''} flex h-6 items-center justify-{justify}">
 	<div
-		class="flex items-center text-xs text-muted-foreground transition-opacity group-hover:opacity-0"
+		class="hidden items-center text-xs text-muted-foreground transition-opacity md:flex md:group-hover:opacity-0"
 	>
 		{new Date(message.timestamp).toLocaleTimeString(undefined, {
 			hour: '2-digit',
@@ -61,14 +61,14 @@
 	<div
 		class="absolute top-0 {actionsPosition === 'left'
 			? 'left-0'
-			: 'right-0'} flex items-center gap-2 opacity-0 transition-opacity group-hover:opacity-100"
+			: 'right-0'} flex items-center gap-2 opacity-100 transition-opacity md:opacity-0 md:group-hover:opacity-100"
 	>
 		{#if siblingInfo && siblingInfo.totalSiblings > 1}
 			<ChatMessageBranchingControls {siblingInfo} {onNavigateToSibling} />
 		{/if}

 		<div
-			class="pointer-events-none inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-hover:pointer-events-auto group-hover:opacity-100"
+			class="pointer-events-auto inset-0 flex items-center gap-1 opacity-100 transition-all duration-150 md:pointer-events-none md:opacity-0 md:group-hover:pointer-events-auto md:group-hover:opacity-100"
 		>
 			<ActionButton icon={Copy} tooltip="Copy" onclick={onCopy} />

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Isaac McFadyen	e0539eb6ae	webui: switch to hash-based routing (alternative of #16079 ) (#16157 ) * Switched web UI to hash-based routing * Added hash to missed goto function call * Removed outdated SPA handling code * Fixed broken sidebar home link	2025-09-26 18:36:48 +03:00
Aleksander Grygier	5d0a40f390	Always show message actions for mobile UI + improvements for user message sizing (#16076 )	2025-09-26 15:59:07 +02:00
Radoslav Gerganov	d12a983659	codeowners : add rgerganov as owner of RPC [no ci] (#16279 )	2025-09-26 16:09:34 +03:00
Aleksei Nikiforov	cc1cfa277b	mtmd : fix uninitialized variable in bicubic_resize (#16275 ) Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> Co-authored-by: Aaron Teo <aaron.teo1@ibm.com>	2025-09-26 15:00:44 +02:00
Georgi Gerganov	54dbc37053	metal : report OOM errors (#16274 )	2025-09-26 14:14:28 +03:00
Adrien Gallouët	b995a10760	common : use cpp-httplib as a cURL alternative for downloads (#16185 ) * vendor : update httplib Signed-off-by: Adrien Gallouët <angt@huggingface.co> * common : use cpp-httplib as a cURL alternative for downloads The existing cURL implementation is intentionally left untouched to prevent any regressions and to allow for safe, side-by-side testing by toggling the `LLAMA_CURL` CMake option. Signed-off-by: Adrien Gallouët <angt@huggingface.co> * ggml : Bump to Windows 10 Signed-off-by: Adrien Gallouët <angt@huggingface.co> --------- Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2025-09-26 14:12:19 +03:00
Adrien Gallouët	4710dd31bb	build : fix build-ios-device (#16257 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2025-09-26 13:39:35 +03:00
Aaron Teo	9b26511857	ggml-cpu: implement MXFP4 SIMD for s390x (#16193 ) * ggml-cpu: impl mxfp4 s390x Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: missing s = sumf Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: fix incorrect kval_mxfp4 type Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: rework mxfp4 Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: missing delta calc Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: fix typo Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: fix typo for vec_splats Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: expand to 2 blocks per loop Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: add unroll to boost perf Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: back to 1 block per loop to test perf Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Revert "ggml-cpu: back to 1 block per loop to test perf" This reverts commit `1fe55724e2`. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-cpu: rm unroll from single block Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> --------- Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2025-09-26 13:27:25 +03:00
Radoslav Gerganov	00217cd413	ci : create git tags for released docker images (#16008 ) * ci : create git tags for released docker images When releasing a docker image for build number X, we should also create the corresponding git tag. This allows users to easily checkout the corresponding source tree for given docker image. * Update .github/workflows/docker.yml Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update .github/workflows/docker.yml Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Apply suggestion from @CISC Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-09-26 10:19:23 +00:00
Daniel Bevenius	3b337b01a1	codeowners : add danbev as owner of build-xcframework.sh [no ci] (#16268 )	2025-09-26 08:53:36 +03:00
R0CKSTAR	a86a580a66	musa: upgrade musa sdk to 4.3.0 (#16240 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2025-09-26 02:56:38 +02:00
R0CKSTAR	0f7c69689f	musa: fix build warnings (#15611 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2025-09-26 02:56:10 +02:00
Sigbjørn Skjæret	835b2b915c	model : add GroveMoE support (#15510 ) * add GroveMoE support * remove constexpr that fails on certain compilers * revert crude scalar div implementation, use cast * build_attn_inp_kv_unified -> build_attn_inp_kv * fix build_attn * re-apply ffn_exps regex changes	2025-09-25 19:50:28 +02:00
Aaron Teo	b05a9d650f	vendors: update miniaudio version (#16212 ) * vendor: update miniaudio.h Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * vendor: update miniaudio.h Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> --------- Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2025-09-25 23:38:10 +08:00
rtaluyev	27052978e4	readme : update bindings (#16144 ) Link to Java JNA bindings to llama.cpp native libraries	2025-09-25 18:20:34 +03:00
Aman Gupta	077c94d0ca	CUDA: add a fused top-K MoE kernel (#16130 ) * CUDA: add a fused top-K MoE kernel This kernel does the following: 1. softmax over the logits per token [n_experts, n_tokens] 2. argmax reduce over the top-k (n_experts_used) logits 3. write weights + ids to global memory It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models * Refactor into ggml_cuda_should_use_topk_moe * Review: Use better coalescing pattern, use WARP_SIZE, store logits into registers before * Review: format + micro-optimizations * Fix bug: fix tie breakers * Add optional norm + clean-up code * Use smem for final write * Add bounds check * Use better memory pattern for writeback	2025-09-25 16:35:05 +02:00
Daniel Bevenius	aa3ee0eb0b	model-conversion : add embedding prompt file support (#15871 ) This commit adds support for passing a prompt file to the model conversion targets/scripts. It also updates the logits.cpp to print out embedding information in the same format as when running the original embedding model. The motivation for this is that it allows us to pass files of different sizes when running the converted models and validating the logits. This can be particularly important when testing the sliding window functionality of models where the sequence length needs to exceed a certain number of tokens to trigger the sliding window logic.	2025-09-25 12:02:36 +02:00
Daniel Bevenius	d0991da39d	server : add support for external server for tests (#16243 ) This commit adds support for using an externally started llama-server instance for the server tests. This can be enabled by setting the DEBUG_EXTERNAL environment variable. The motivation for this is to allow debugging of the server itself when investigating a test failure. Instructions for how to do this are added to the README.md file in the tests directory.	2025-09-25 11:36:47 +02:00
junchao-zhao	aa719c2f88	ggml : fix loongarch lsx compilation error (#15864 )	2025-09-25 12:22:55 +03:00
Johannes Gäßler	4cdd0bb453	docs: fix typo [no ci] (#16244 )	2025-09-25 12:12:27 +03:00
Douglas Hanley	b5bd037832	llama : add support for qwen3 reranker (#15824 )	2025-09-25 11:53:09 +03:00
Georgi Gerganov	dfcd53f7ec	metal : fuse NORM + MUL + ADD, support non-multiples of 4 (#16220 ) * metal : fuse NORM + MUL + ADD * metal : support norms of non-multiple of 4 * cont : fix comment [no ci]	2025-09-25 11:30:16 +03:00
Georgi Gerganov	4ea00794b8	metal : relax reorder conditions (#16216 )	2025-09-25 11:29:42 +03:00
Georgi Gerganov	02a6a82ae7	metal : restore im2col perf (#16219 )	2025-09-25 11:29:08 +03:00
Radoslav Gerganov	c498fc82fe	rpc : use ggml logging facilities Use RPC_DEBUG environment variable to enable debug messages. Add helper macro LOG_DBG() which does an early check of the env var before calling GGML_LOG_DEBUG(). Make sure we log a debug message for every server function.	2025-09-25 07:20:02 +00:00
Aaron Teo	e7a5130a20	codeowners: add ownership of zdnn backend [no ci] (#16232 ) add @Andreas-Krebbel to owners of zDNN backend Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2025-09-25 08:06:30 +03:00
Eve	bee378e098	ci: run the x64 and arm ci on the github machines instead (#16183 ) * run the x64 ci on regular machines * set up the same thing for arm fix test-quantize-perf just like #12306 * try to disable sve * add another sve run	2025-09-25 08:06:06 +03:00
Aaron Teo	5fb557653b	devops: fix s390x docker release failure (#16231 )	2025-09-25 11:36:30 +08:00
Aaron Teo	4ae88d07d0	codeowners: add ownership of zdnn backend [no ci] (#16229 ) add @AlekseiNikiforovIBM to owners of zDNN backend Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2025-09-25 00:25:04 +08:00
Johannes Gäßler	e789095502	llama: print memory breakdown on exit (#15860 ) * llama: print memory breakdown on exit	2025-09-24 16:53:48 +02:00
Acly	f2a789e334	ggml : split graph allocations according to backend max buffer size (#15815 ) * ggml : make gallocr respect the backend's max buffer size * if the graph requires more memory than can fit into a single allocation, split it into multiple backend buffers * vulkan: report the actual max allocation size in buffer type interface * fix missing newline, apple-clang warning * track size of individual chunks in ggml_dyn_tallocr and raise max chunks. revert to use suballocation_block_size as max chunk size for vulkan. * track (chunk, offset) pairs instead of "global" offsets through gallocr. * simpler, don't need loops to map between local/global offsets * touches more code * fix dyn_tallocr_max_size and initialization * fix memory leak when buffers are reused due to same buffer type appearing multiple times * make vbuffer allocation follow the same logic as backend_buffer did before * continue to use leftover unallocated space of previous chunks after a new one has been created * treat free blocks of each chunk as separate list * they're still allocated together, but start/end of each chunk is tracked, and allocate/free iterate over sub-ranges * exhaust freed blocks of all chunks before considering their last blocks with unallocated space * start with 0 chunks/blocks and create chunks as needed * allow the last chunk to grow beyond max size * refactor: move adding new free block and new chunk into separate functions * allocate chunks individually with a separate free-blocks list for each one * needs a bit more memory/allocations/indirections, but code is simpler * fix warnings (missing static) & debug checks	2025-09-24 16:17:49 +02:00
Tarek Dakhran	3a59971967	model : add label for LiquidAI LFM2-2.6B model (#16204 ) * model : add label for LiquidAI LFM2-2.6B model HF link: [LiquidAI/LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B). Support for GGUF conversion and inference is added in #14620. However, due to similar `n_embd`, it identifies as a 1.2B model. Fix the label by using `n_ff` to identify the model instead. Output of `llama-bench`: ``` \| model \| size \| params \| backend \| threads \| test \| t/s \| \| ------------------------------ \| ---------: \| ---------: \| ---------- \| ------: \| --------------: \| -------------------: \| \| lfm2 1.2B F16 \| 2.18 GiB \| 1.17 B \| CPU \| 10 \| pp512 \| 223.97 ± 5.32 \| \| lfm2 2.6B F16 \| 4.79 GiB \| 2.57 B \| CPU \| 10 \| pp512 \| 92.53 ± 4.14 \| \| lfm2 350M F16 \| 676.25 MiB \| 354.48 M \| CPU \| 10 \| pp512 \| 725.52 ± 11.70 \| \| lfm2 700M F16 \| 1.38 GiB \| 742.49 M \| CPU \| 10 \| pp512 \| 336.22 ± 12.93 \| ``` * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-09-24 13:42:26 +02:00
Jie Fu (傅杰)	63b54c81a6	model-conversion : make causal-verify-logits fails with model names containing "." (#16215 ) Signed-off-by: Jie Fu <jiefu@tencent.com>	2025-09-24 10:25:26 +02:00
Uilian Ries	152729f884	common : add missing chrono header for common.cpp (#16211 ) Signed-off-by: Uilian Ries <uilianries@gmail.com>	2025-09-24 09:53:47 +03:00
Sigbjørn Skjæret	c0c59c1157	codeowners : match all requirements files (#16214 )	2025-09-24 08:53:20 +02:00
Jie Fu (傅杰)	7735706b93	model-conversion : run-org-model.py fails to run on mac m1 (#16213 ) Signed-off-by: Jie Fu <jiefu@tencent.com>	2025-09-24 08:46:52 +02:00
Daniel Bevenius	4d9ea03d17	codeowners : use slash prefix for root files [no ci] (#16210 ) This commit adds a leading slash to the paths of root-level files in the CODEOWNERS file. The motivation for this is that these might otherwise match files in subdirectories that have other/additional owners will override them. Refs: https://github.com/ggml-org/llama.cpp/pull/16209#issuecomment-3326434274	2025-09-24 08:10:09 +02:00
Jie Fu (傅杰)	8ba548dae2	model-conversion : fix the make targets in the README.md (#16209 ) Fix two incorrect make targets in the readme. Signed-off-by: Jie Fu <jiefu@tencent.com>	2025-09-24 06:19:23 +02:00
Georgi Gerganov	f505bd83ca	ci : disable AMD workflows + update NVIDIA workflows (#16200 ) * ci : disable AMD workflows + update NVIDIA workflows * cont : fixes * cont : update nvidia vulkan workflows	2025-09-23 20:41:40 +03:00
Georgi Gerganov	0889589dbe	ci : enable Vulkan workflow on Mac (#16194 )	2025-09-23 13:44:25 +03:00
Xiangyan Sun	4e29084ba4	ggml-cpu: Respect cpumask settings (#16164 )	2025-09-23 11:58:12 +03:00
Sigbjørn Skjæret	f6b4af3d04	ggml : fix uninitialized is_on_grid in quantize_row_iq3_xxs_impl (#15928 ) * fix uninitialized is_on_grid in quantize_row_iq3_xxs_impl * change initialization to true	2025-09-23 10:25:20 +02:00