sync : ggml

ggml : bump version to 0.15.2 (ggml/1548)
pi : remove docs from system prompt (#24791 )
2026-06-19 12:17:45 +02:00 · 2026-06-19 10:19:14 +03:00 · 2026-06-19 10:19:14 +03:00 · 2026-06-19 09:34:00 +03:00 · 2026-06-19 09:22:34 +03:00 · 2026-06-19 08:55:38 +03:00
268 changed files with 17289 additions and 6049 deletions
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

@@ -37,7 +37,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -3,9 +3,9 @@ ARG UBUNTU_VERSION=24.04
 ARG CUDA_VERSION=12.8.1
 ARG GCC_VERSION=14
 # Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -5,9 +5,9 @@ ARG APP_REVISION=N/A

 ## Build Image

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

-ARG GGML_SYCL_F16=OFF
+ARG GGML_SYCL_F16=ON
 ARG LEVEL_ZERO_VERSION=1.28.2
 ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
@@ -24,7 +24,8 @@ COPY . .

 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
+        && export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
@@ -42,7 +43,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ascendai/cann:$ASCEND_VERSION AS build
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \
    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
-FROM ascendai/cann:$ASCEND_VERSION AS runtime
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -2,9 +2,9 @@ ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -1,17 +1,17 @@
-ARG OPENVINO_VERSION_MAJOR=2026.0
-ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
+ARG OPENVINO_VERSION_MAJOR=2026.2
+ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
 ARG UBUNTU_VERSION=24.04

 # Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
+ARG IGC_VERSION=v2.34.4
+ARG IGC_VERSION_FULL=2_2.34.4+21428
+ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
+ARG IGDGMM_VERSION=22.10.0

 # Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.32.0
-ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
+ARG NPU_DRIVER_VERSION=v1.33.0
+ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453
 ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2

 # Optional proxy build arguments
@@ -23,7 +23,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ## Build Image
-FROM ubuntu:${UBUNTU_VERSION} AS build
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

 # Pass proxy args to build stage
 ARG http_proxy
@@ -46,13 +46,18 @@ RUN apt-get update && \
        intel-opencl-icd && \
    rm -rf /var/lib/apt/lists/*

-# Install OpenVINO for Ubuntu 24.04
+# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds.
+# Install OpenVINO for Ubuntu 24.04.
 ARG OPENVINO_VERSION_MAJOR
 ARG OPENVINO_VERSION_FULL
-RUN mkdir -p /opt/intel && \
-    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
-    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
-    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \
+    mkdir -p /opt/intel && \
+    TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    if [ ! -f "$TGZ" ]; then \
+        wget -O "$TGZ" https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz; \
+    fi && \
+    tar -xf "$TGZ" -C /opt/intel/ && \
+    mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
    cd - && \
@@ -68,14 +73,14 @@ COPY . .
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
+        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_OPENVINO=ON && \
-    cmake --build build/ReleaseOV -j$(nproc)"
+    cmake --build build/ReleaseOV --parallel "

-# Copy all necessary libraries
+# Copy all necessary libraries (build outputs + OpenVINO runtime libs)
 RUN mkdir -p /app/lib && \
-    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
-    find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
-    find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
+    find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \
+    find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \;

 # Create runtime directories and copy binaries
 RUN mkdir -p /app/full \
@@ -88,7 +93,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base Runtime Image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 # Pass proxy args to runtime stage
 ARG http_proxy
@@ -120,33 +125,41 @@ ARG IGC_VERSION_FULL
 ARG COMPUTE_RUNTIME_VERSION
 ARG COMPUTE_RUNTIME_VERSION_FULL
 ARG IGDGMM_VERSION
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/neo/
+RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \
+    set -eux; \
+    cd /var/cache/intel-gpu; \
+    for url in \
+        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \
+        f=$(basename "$url"); \
+        [ -f "$f" ] || wget -q -O "$f" "$url"; \
+    done; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends ./*.deb; \
+    rm -rf /var/lib/apt/lists/*

 # Install NPU drivers
 ARG NPU_DRIVER_VERSION
 ARG NPU_DRIVER_FULL
 ARG LIBZE1_VERSION
-RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
-    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/npu/
-
-RUN cd /tmp \
-    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
-    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
-    && rm libze1_${LIBZE1_VERSION}_amd64.deb
+RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \
+    set -eux; \
+    TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
+    if [ ! -f "$TGZ" ]; then \
+        wget -q -O "$TGZ" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
+    fi; \
+    DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \
+    if [ ! -f "$DEB" ]; then \
+        wget -q -O "$DEB" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; \
+    fi; \
+    mkdir /tmp/npu/ && cd /tmp/npu/ && tar -xf "$TGZ" && cp "$DEB" .; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends ./*.deb; \
+    rm -rf /tmp/npu/ /var/lib/apt/lists/*

 COPY --from=build /app/lib/ /app/

@@ -166,22 +179,26 @@ RUN apt-get update && \
    python3 \
    python3-venv \
    python3-pip && \
-    python3 -m venv /ov-venv && \
-    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
-    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
+    python3 -m venv /openvino-venv && \
+    /openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    /openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

-ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
+# Activate the venv
+ENV VIRTUAL_ENV=/openvino-venv \
+    PATH=/openvino-venv/bin:$PATH
+
+ENTRYPOINT ["/app/tools.sh"]


 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/

 WORKDIR /app

@@ -5,7 +5,7 @@ ARG ROCM_VERSION=7.2.1
 ARG AMDGPU_VERSION=7.2.1

 # Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -5,7 +5,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
-FROM gcc:${GCC_VERSION} AS build
+FROM docker.io/gcc:${GCC_VERSION} AS build

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
@@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


 ### Base image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -33,7 +33,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
@@ -30,7 +30,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -0,0 +1,24 @@
+name: "Windows - Setup OpenVINO Toolkit"
+description: "Setup OpenVINO Toolkit for Windows"
+inputs:
+  path:
+    description: "Installation path"
+    required: true
+  version_major:
+    description: "OpenVINO major version (e.g., 2026.2)"
+    required: true
+  version_full:
+    description: "OpenVINO full version"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Download and extract OpenVINO Runtime
+      shell: powershell
+      run: |
+        $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip"
+        $out = "openvino.zip"
+        Invoke-WebRequest -Uri $url -OutFile $out
+        Expand-Archive -Path $out -DestinationPath ${{ inputs.path }} -Force
+        Remove-Item $out
@@ -68,8 +68,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -91,6 +91,34 @@ jobs:
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}

+  windows-2022-openvino-cache:
+    runs-on: windows-2022
+
+    env:
+      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Cache
+        uses: actions/cache@v5
+        id: cache-openvino
+        with:
+          path: ./openvino_toolkit
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+      - name: Setup OpenVINO Toolkit
+        if: steps.cache-openvino.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-openvino
+        with:
+          path: ./openvino_toolkit
+          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+          version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
  windows-2022-rocm-cache:
    runs-on: windows-2022

@@ -37,14 +37,10 @@ jobs:
  ubuntu-24-openvino:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

-    concurrency:
-      group: openvino-gpu-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -78,7 +74,7 @@ jobs:
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release -j $(nproc)
+          time cmake --build build/ReleaseOV --config Release --parallel

      - name: Test (CPU)
        id: cmake_test_cpu
@@ -93,4 +89,81 @@ jobs:
        run: |
          cd ${{ github.workspace }}
          export GGML_OPENVINO_DEVICE=GPU
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
+          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000
+
+  openvino-windows-2022:
+    runs-on: windows-2022
+
+    env:
+      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: openvino-windows-2022
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Setup Cache
+        uses: actions/cache@v5
+        id: cache-openvino
+        with:
+          path: ./openvino_toolkit
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+      - name: Setup OpenVINO Toolkit
+        if: steps.cache-openvino.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-openvino
+        with:
+          path: ./openvino_toolkit
+          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+          version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+      - name: Install OpenCL using vcpkg
+        shell: powershell
+        run: |
+          git clone https://github.com/microsoft/vcpkg C:\vcpkg
+          C:\vcpkg\bootstrap-vcpkg.bat
+          C:\vcpkg\vcpkg install opencl
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          REM Find extracted OpenVINO folder dynamically
+          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
+
+          if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
+              echo ERROR: OpenVINOConfig.cmake not found
+              exit /b 1
+          )
+
+          call "%OPENVINO_ROOT%\setupvars.bat"
+
+          cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
+            -A x64 ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DGGML_OPENVINO=ON ^
+            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+
+          cmake --build build\ReleaseOV --config Release -- /m
+
+      - name: Test (CPU)
+        id: cmake_test_cpu
+        shell: cmd
+        # TODO: fix and re-enable the `test-llama-archs` test below
+        run: |
+          REM Find extracted OpenVINO folder dynamically
+          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
+          call "%OPENVINO_ROOT%\setupvars.bat"
+
+          cd build
+          ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000
@@ -264,14 +264,10 @@ jobs:
  gpu-openvino-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

-    concurrency:
-      group: openvino-gpu-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -46,11 +46,13 @@ jobs:

    steps:
      - id: check
+        env:
+          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            echo "should_release=true" >> $GITHUB_OUTPUT
          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
-            if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
+            if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
              echo "should_release=false" >> $GITHUB_OUTPUT
            else
              echo "should_release=true" >> $GITHUB_OUTPUT
@@ -443,9 +445,9 @@ jobs:
      openvino_version: ${{ steps.openvino_version.outputs.value }}

    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Set OpenVINO version output
@@ -528,6 +530,109 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

+  windows-openvino:
+    runs-on: windows-2022
+
+    outputs:
+      openvino_version: ${{ steps.openvino_version.outputs.value }}
+
+    env:
+      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+
+    steps:
+      - name: Set OpenVINO version output
+        id: openvino_version
+        shell: bash
+        run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+            fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-openvino
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Setup Cache
+        uses: actions/cache@v5
+        id: cache-openvino
+        with:
+          path: ./openvino_toolkit
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+      - name: Setup OpenVINO Toolkit
+        if: steps.cache-openvino.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-openvino
+        with:
+          path: ./openvino_toolkit
+          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+          version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+      - name: Install OpenCL using vcpkg
+        shell: powershell
+        run: |
+          git clone https://github.com/microsoft/vcpkg C:\vcpkg
+          C:\vcpkg\bootstrap-vcpkg.bat
+          C:\vcpkg\vcpkg install opencl
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          REM Find extracted OpenVINO folder dynamically
+          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
+
+          if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
+              echo ERROR: OpenVINOConfig.cmake not found
+              exit /b 1
+          )
+
+          call "%OPENVINO_ROOT%\setupvars.bat"
+
+          cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
+            -A x64 ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DGGML_OPENVINO=ON ^
+            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+
+          cmake --build build\ReleaseOV --config Release -- /m
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-openvino
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        shell: powershell
+        run: |
+          Copy-Item LICENSE .\build\ReleaseOV\bin\
+          7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
+          name: llama-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
+
  windows-cpu:
    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
@@ -1403,6 +1508,7 @@ jobs:
      - windows-cuda
      #- windows-sycl
      - windows-hip
+      - windows-openvino
      - ubuntu-22-rocm
      - ubuntu-cpu
      - ubuntu-vulkan
@@ -1524,6 +1630,7 @@ jobs:
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+            - [Windows x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ needs.windows-openvino.outputs.openvino_version }}-x64.zip)
            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

@@ -25,13 +25,3 @@ Commits:
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
-
-Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
@@ -37,7 +37,7 @@ LLM inference in C/C++

 Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:

- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
 - Run with Docker - see our [Docker documentation](docs/docker.md)
 - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
 - Build from source by cloning this repository - check out [our build guide](docs/build.md)
@@ -20,16 +20,21 @@ int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);

-// hands the update over to the install script, which downloads and swaps the binary
+// Self-update is only supported for binaries built with llama-install.sh
 static int llama_update(int argc, char ** argv) {
    (void) argc;
    (void) argv;

+#ifdef LLAMA_INSTALL_BUILD
 #if defined(_WIN32)
    return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
 #else
    return system("curl -fsSL https://llama.app/install.sh | sh");
 #endif
+#else
+    printf("Updates are available only when installed from https://llama.app\n");
+    return 1;
+#endif
 }

 static const char * progname;
@@ -46,21 +51,29 @@ struct command {
    int (*func)(int, char **);
 };

+#ifdef LLAMA_INSTALL_BUILD
+#define UPDATE_HIDDEN false
+#else
+#define UPDATE_HIDDEN true
+#endif
+
 static const command cmds[] = {
-    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
-    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
-    {"update",        "Update llama to the latest release",                 {},           false, llama_update       },
-    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
-    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
-    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
-    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
-    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
-    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
-    {"version",       "Show version",                                       {},           false, version            },
-    {"licenses",      "Show third-party licenses",                          {"credits"},  false, licenses           },
-    {"help",          "Show available commands",                            {},           false, help               },
+    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
+    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
+    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
+    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
+    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
+    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
+    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,          llama_fit_params   },
+    {"quantize",      "Quantize a model",                                   {},           true,          llama_quantize     },
+    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,          llama_perplexity   },
+    {"version",       "Show version",                                       {},           false,         version            },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false,         licenses           },
+    {"help",          "Show available commands",                            {},           false,         help               },
 };

+#undef UPDATE_HIDDEN
+
 static int version(int argc, char ** argv) {
    printf("%s\n", llama_build_info());
    return 0;
@@ -285,58 +285,15 @@ static std::string clean_file_name(const std::string & fname) {
    return clean_fname;
 }

-static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
-    GGML_ASSERT(!params.model.hf_repo.empty());
-
-    // the returned hf_repo is without tag
-    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
-
-    // "latest" tag (default if not specified) is translated to "default" preset
-    if (hf_tag == "latest") {
-        hf_tag = "default";
-    }
-
-    std::string model_endpoint = common_get_model_endpoint();
-    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
-
-    // prepare local path for caching
-    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
-    auto preset_path = fs_get_cache_file(preset_fname);
-    common_download_opts opts;
-    opts.bearer_token = params.hf_token;
-    opts.offline = params.offline;
-
-    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
-    const int status = common_download_file_single(preset_url, preset_path, opts);
-    const bool has_preset = status >= 200 && status < 400;
-
-    // remote preset is optional, so we don't error out if not found
-    if (has_preset) {
-        LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
-        common_preset_context ctx(ex, /* only_remote_allowed */ true);
-        common_preset global;
-        auto remote_presets = ctx.load_from_ini(preset_path, global);
-        remote_presets = ctx.cascade(global, remote_presets);
-        if (remote_presets.find(hf_tag) != remote_presets.end()) {
-            common_preset preset = remote_presets.at(hf_tag);
-            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
-            preset.apply_to_params(params);
-        } else {
-            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
-        }
-    } else {
-        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
-    }
-
-    return has_preset;
-}
-
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;

    bool found_mtp = false;
    common_params_model mtp;
+
+    bool found_preset = false;
+    std::string preset_path;
 };

 static handle_model_result common_params_handle_model(struct common_params_model & model,
@@ -355,6 +312,12 @@ static handle_model_result common_params_handle_model(struct common_params_model
        common_download_opts hf_opts = opts;
        auto download_result = common_download_model(model, hf_opts);

+        if (!download_result.preset_path.empty()) {
+            result.found_preset = true;
+            result.preset_path = download_result.preset_path;
+            return result; // skip everything else if preset.ini is used
+        }
+
        if (download_result.model_path.empty()) {
            throw std::runtime_error("failed to download model from Hugging Face");
        }
@@ -454,6 +417,17 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)

    try {
        auto res = common_params_handle_model(params.model, opts);
+        if (res.found_preset) {
+            if (!params.models_preset.empty()) {
+                throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
+            }
+            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
+            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
+            params.models_preset    = res.preset_path;
+            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
+            return true;
+        }
+
        if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -601,30 +575,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    // parse the first time to get -hf option (used for remote preset)
    parse_cli_args();

-    // export_graph_ops loads only metadata
-    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
-
-    // maybe handle remote preset
-    if (!params.model.hf_repo.empty() && !skip_model_download) {
-        std::string cli_hf_repo = params.model.hf_repo;
-        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
-
-        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
-        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
-        std::string preset_hf_repo = params.model.hf_repo;
-        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
-
-        if (has_preset) {
-            // re-parse CLI args to override preset values
-            parse_cli_args();
-        }
-
-        // preserve hf_repo from preset if needed
-        if (preset_has_hf_repo) {
-            params.model.hf_repo = preset_hf_repo;
-        }
-    }
-
    postprocess_cpu_params(params.cpuparams,       nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

@@ -635,15 +585,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    // handle model and download
-    if (!skip_model_download) {
-        common_params_handle_models(params, ctx_arg.ex);
-    }
+    // export_graph_ops loads only metadata
+    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;

-    // model is required (except for server)
-    // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
-        throw std::invalid_argument("error: --model is required\n");
+    if (!skip_model_download) {
+        // handle model and download
+        common_params_handle_models(params, ctx_arg.ex);
+
+        // model is required (except for server)
+        // TODO @ngxson : maybe show a list of available models in CLI in this case
+        if (params.model.path.empty()
+                && ctx_arg.ex != LLAMA_EXAMPLE_SERVER
+                && !params.usage
+                && !params.completion) {
+            throw std::invalid_argument("error: --model is required\n");
+        }
    }

    if (params.escape) {
@@ -103,6 +103,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
            data.grammar_triggers = {
                { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
            };
+            if (autoparser.tools.format.openai_wrapper_trigger) {
+                // model emits the OpenAI function wrapper, trigger on it
+                data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
+            }
        }
    }

@@ -134,7 +138,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons
            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
            parser = ctx.reasoning_parser + p.space() + p.choice({
                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
-                response_format
+                p.space() + response_format  + p.space()
            }) + p.end();
            pure_content = false;
        } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
@@ -224,13 +228,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        auto single_tool_parser = p.standard_json_tools(
            format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
        tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
    } else {
        tools_parser = p.standard_json_tools(
            format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
    }

    // Handle content wrappers if present
@@ -393,8 +397,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                           (schema_info.resolves_to_string(param_schema) ?
                                p.tool_arg_string_value(until_suffix) :
                                p.tool_arg_json_value(p.schema(
-                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                    p.space()) +
+                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false))) +
                           p.tool_arg_close(p.literal(arguments.value_suffix)));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
@@ -181,6 +181,7 @@ struct tool_format_analysis {

    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
+    bool openai_wrapper_trigger = false;  // model emits the OpenAI function wrapper, trigger on it

    std::string              function_field = "function";
    std::string              name_field     = "name";
@@ -165,6 +165,14 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
          }
      },
+      // template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
+              tmpl.src.find("Do not use variables.") != std::string::npos) {
+              analysis.tools.format.openai_wrapper_trigger = true;
+              LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
+          }
+      },

    });

@@ -1229,8 +1237,8 @@ void analyze_tools::extract_argument_name_markers() {
            left_result.tags["pre"] == right_result.tags["pre"] &&
            left_result.tags["suffix"] == right_result.tags["suffix"]) {
            // Name is inside a structure (e.g., JSON key): prefix is the shared wrapper
-            arguments.name_prefix = trim_whitespace(left_result.tags["pre"]);
-            arguments.name_suffix = trim_leading_whitespace(left_result.tags["suffix"]);
+            arguments.name_prefix = left_result.tags["pre"];
+            arguments.name_suffix = left_result.tags["suffix"];
        } else if (diff.left.substr(0, ARG_FIRST.length()) == ARG_FIRST && diff.right.substr(0, ARG_SECOND.length()) == ARG_SECOND) {
            // Name is directly in the diff: prefix comes from last marker in diff.prefix
            auto pre_parser = build_tagged_peg_parser([&](common_peg_parser_builder & p) {
@@ -1315,8 +1323,7 @@ void analyze_tools::extract_argument_value_markers() {
                value_suffix = value_suffix.substr(0, end_marker_pos);
            }
        }
-        value_suffix = trim_leading_whitespace(value_suffix);
-        if (!value_suffix.empty()) {
+        if (!trim_whitespace(value_suffix).empty()) {
            arguments.value_suffix = value_suffix;
        }
    }
@@ -363,7 +363,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
    }

    if ((is_arg_value || is_arg_string_value) && current_tool) {
-        std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));
+        std::string value_content = std::string(node.text);

        std::string value_to_add;
        if (value_content.empty() && is_arg_string_value) {
@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
                auto arg_name_parser = literal(prop_name);

                common_peg_parser arg_value_parser = eps();
-                auto string_value_parser = choice({
-                    literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
-                    literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
-                });
+                // Quoted literal as a value: normalize_quotes_to_json preserves escapes.
+                auto string_value_parser = tool_arg_value(choice({
+                    literal("\"") + string_content('"') + literal("\""),
+                    literal("'") + string_content('\'') + literal("'")
+                }));

                if (is_string_type) {
                    arg_value_parser = string_value_parser;
@@ -745,7 +746,8 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
    const std::string &              effective_args_key,
    const std::string &              call_id_key,
    const std::string &              gen_call_id_key,
-    const std::vector<std::string> & parameters_order) {
+    const std::vector<std::string> & parameters_order,
+    bool                             accept_openai_wrapper) {

    auto tool_choices    = choice();
    auto name_key_parser = literal("\"" + effective_name_key + "\"");
@@ -807,7 +809,13 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
                return idx_a < idx_b;
            });

-        auto ordered_body = tool_open(literal("{")) + space();
+        // accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
+        common_peg_parser type_field = eps();
+        if (accept_openai_wrapper) {
+            type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
+                                  literal("\"function\"") + space() + literal(",") + space());
+        }
+        auto ordered_body = tool_open(literal("{")) + space() + type_field;
        for (size_t i = 0; i < parser_pairs.size(); i++) {
            ordered_body = ordered_body + parser_pairs[i].first;
            if (i < parser_pairs.size() - 1) {
@@ -870,7 +878,8 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       bool                             function_is_key,
                                                       const std::string &              call_id_key,
                                                       const std::string &              gen_call_id_key,
-                                                       const std::vector<std::string> & parameters_order) {
+                                                       const std::vector<std::string> & parameters_order,
+                                                       bool                             accept_openai_wrapper) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
@@ -888,7 +897,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
        if (!name_spec.first.empty() || !args_spec.first.empty()) {
            tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
        } else {
-            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
+            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
        }
    }

@@ -120,7 +120,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
                                          bool                             function_is_key = false,
                                          const std::string &              call_id_key = "",
                                          const std::string &              gen_call_id_key = "",
-                                          const std::vector<std::string> & parameters_order = {});
+                                          const std::vector<std::string> & parameters_order = {},
+                                          bool                             accept_openai_wrapper = false);

    // Legacy-compatible helper for building XML/tagged style tool calls
    // Used by tests and manual parsers
@@ -157,7 +158,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
                                                 const std::string &              effective_args_key,
                                                 const std::string &              call_id_key,
                                                 const std::string &              gen_call_id_key,
-                                                 const std::vector<std::string> & parameters_order);
+                                                 const std::vector<std::string> & parameters_order,
+                                                 bool                             accept_openai_wrapper);
 };

 inline common_peg_arena build_chat_peg_parser(
@@ -1979,6 +1979,146 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    return data;
 }

+// Cohere2 MoE (a.k.a. "North Code") parser.
+//
+// The assistant turn is fully marker-wrapped:
+//   <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+//     <|START_THINKING|>{reasoning}<|END_THINKING|>
+//     then EITHER content:    <|START_TEXT|>{content}<|END_TEXT|>
+//          OR     tool calls: <|START_ACTION|>[
+//                                 {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ...
+//                             ]<|END_ACTION|>
+//   <|END_OF_TURN_TOKEN|>
+//
+// The generation prompt forces a leading <|START_THINKING|> (when reasoning is enabled, which is
+// the template default), so the model's output continues from *inside* the thinking block. The
+// parser literal therefore only covers the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix
+// and the reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers itself,
+// regardless of whether they came from the generation prompt or the generated text.
+static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template &          tmpl,
+                                                              const autoparser::generation_params & inputs) {
+    common_chat_params data;
+
+    const std::string TURN_START    = "<|START_OF_TURN_TOKEN|>";
+    const std::string TURN_END      = "<|END_OF_TURN_TOKEN|>";
+    const std::string CHATBOT       = "<|CHATBOT_TOKEN|>";
+    const std::string USER          = "<|USER_TOKEN|>";
+    const std::string SYSTEM        = "<|SYSTEM_TOKEN|>";
+    const std::string THINK_START   = "<|START_THINKING|>";
+    const std::string THINK_END     = "<|END_THINKING|>";
+    const std::string TEXT_START    = "<|START_TEXT|>";
+    const std::string TEXT_END      = "<|END_TEXT|>";
+    const std::string ACTION_START  = "<|START_ACTION|>";
+    const std::string ACTION_END    = "<|END_ACTION|>";
+    const std::string RESULT_START  = "<|START_TOOL_RESULT|>";
+    const std::string RESULT_END    = "<|END_TOOL_RESULT|>";
+
+    // Stable prefix of the generation prompt that precedes the (forced) <|START_THINKING|> marker.
+    const std::string GEN_PREFIX = TURN_START + CHATBOT;
+
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking  = true;
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+    data.preserved_tokens   = {
+        TURN_START, TURN_END, CHATBOT, USER, SYSTEM,
+        THINK_START, THINK_END,
+        TEXT_START, TEXT_END,
+        ACTION_START, ACTION_END,
+        RESULT_START, RESULT_END,
+    };
+
+    // Split the rendered prompt into per-role message spans. Tool results are rendered with the
+    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
+    // the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "assistant", GEN_PREFIX },
+        { "user",      TURN_START + USER },
+        { "tool",      TURN_START + SYSTEM + RESULT_START },
+        { "system",    TURN_START + SYSTEM },
+    });
+
+    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
+
+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PREFIX + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + TEXT_START + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto generation_prompt = p.literal(GEN_PREFIX);
+        auto end               = p.end();
+
+        // The thinking block is always present (the generation prompt forces <|START_THINKING|>).
+        // When extracting reasoning, capture its body; otherwise keep the whole block (markers
+        // included) inline as content, matching reasoning_format=NONE conventions.
+        common_peg_parser reasoning = p.eps();
+        if (extract_reasoning) {
+            reasoning = p.optional(p.literal(THINK_START) +
+                                   p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
+                                   p.optional(p.literal(THINK_END)));
+        } else {
+            reasoning = p.optional(p.content(p.literal(THINK_START) +
+                                             p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
+                                             p.optional(p.literal(THINK_END))));
+        }
+
+        auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
+
+        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
+            return generation_prompt + reasoning + text_content + p.optional(p.literal(TURN_END)) + end;
+        }
+
+        auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+        // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|>
+        auto tool_calls = p.standard_json_tools(ACTION_START, ACTION_END, inputs.tools, inputs.parallel_tool_calls,
+                                                /* force_tool_calls = */ true,
+                                                /* name_key         = */ "tool_name",
+                                                /* args_key         = */ "parameters",
+                                                /* array_wrapped    = */ true,
+                                                /* function_is_key  = */ false,
+                                                /* call_id_key      = */ "",
+                                                /* gen_call_id_key  = */ "tool_call_id",
+                                                /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" });
+
+        // Content and tool calls are mutually exclusive in this format.
+        common_peg_parser body = require_tools ? tool_calls : p.choice({ tool_calls, text_content });
+
+        return generation_prompt + reasoning + body + p.optional(p.literal(TURN_END)) + end;
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto         schema   = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ACTION_START }
+        };
+    }
+
+    return data;
+}
+
 namespace workaround {

 static void map_developer_role_to_system(json & messages) {
@@ -2227,6 +2367,15 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

+    // Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and
+    // <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older
+    // Command-R templates use <|START_RESPONSE|>).
+    if (src.find("<|START_TEXT|>") != std::string::npos &&
+        src.find("<|START_ACTION|>") != std::string::npos) {
+        LOG_DBG("Using specialized template: Cohere2 MoE\n");
+        return common_chat_params_init_cohere2moe(tmpl, params);
+    }
+
    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
@@ -2529,8 +2678,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
            }
            return msg;
        }
-        throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
-                                 effective_input.substr(result.end));
+        LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
+        LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
+        throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
    }

    common_chat_msg msg;
@@ -642,10 +642,11 @@ struct common_params {
    std::vector<std::string> server_tools;

    // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir    = "";     // directory containing models for the router server
+    std::string models_preset = "";     // directory containing model presets for the router server
+    int models_max = 4;                 // maximum number of models to load simultaneously
+    bool models_autoload = true;        // automatically load models when requested via the router server
+    std::string models_preset_hf = "";  // show a warning about remote presets on router loaded (if not empty)

    bool log_json = false;

@@ -696,6 +696,7 @@ struct hf_plan {
    hf_cache::hf_files model_files;
    hf_cache::hf_file mmproj;
    hf_cache::hf_file mtp;
+    hf_cache::hf_file preset; // if set, only this file is downloaded
 };

 static hf_plan get_hf_plan(const common_params_model  & model,
@@ -717,6 +718,14 @@ static hf_plan get_hf_plan(const common_params_model  & model,
        return plan;
    }

+    // if preset.ini exists in the repo root, download only that file
+    for (const auto & f : all) {
+        if (f.path == "preset.ini") {
+            plan.preset = f;
+            return plan;
+        }
+    }
+
    hf_cache::hf_file primary;

    if (!model.hf_file.empty()) {
@@ -794,14 +803,19 @@ common_download_model_result common_download_model(const common_params_model  &

    if (is_hf) {
        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
-        for (const auto & f : hf.model_files) {
-            tasks.push_back({f.url, f.local_path});
-        }
-        if (!hf.mmproj.path.empty()) {
-            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
-        }
-        if (!hf.mtp.path.empty()) {
-            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+        if (!hf.preset.path.empty()) {
+            // if preset.ini exists, only download that file alone
+            tasks.push_back({hf.preset.url, hf.preset.local_path});
+        } else {
+            for (const auto & f : hf.model_files) {
+                tasks.push_back({f.url, f.local_path});
+            }
+            if (!hf.mmproj.path.empty()) {
+                tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
+            }
+            if (!hf.mtp.path.empty()) {
+                tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+            }
        }
    } else if (!model.url.empty()) {
        tasks = get_url_tasks(model);
@@ -835,17 +849,22 @@ common_download_model_result common_download_model(const common_params_model  &
    }

    if (is_hf) {
-        for (const auto & f : hf.model_files) {
-            hf_cache::finalize_file(f);
-        }
-        result.model_path = hf.primary.final_path;
+        if (!hf.preset.path.empty()) {
+            // if preset.ini is used, do not set other paths
+            result.preset_path = hf_cache::finalize_file(hf.preset);
+        } else {
+            for (const auto & f : hf.model_files) {
+                hf_cache::finalize_file(f);
+            }
+            result.model_path = hf.primary.final_path;

-        if (!hf.mmproj.path.empty()) {
-            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
-        }
+            if (!hf.mmproj.path.empty()) {
+                result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
+            }

-        if (!hf.mtp.path.empty()) {
-            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+            if (!hf.mtp.path.empty()) {
+                result.mtp_path = hf_cache::finalize_file(hf.mtp);
+            }
        }
    } else {
        result.model_path = model.path;
@@ -997,3 +1016,87 @@ std::vector<common_cached_model_info> common_list_cached_models() {

    return result;
 }
+
+bool common_download_remove(const std::string & hf_repo_with_tag) {
+    namespace fs = std::filesystem;
+
+    auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
+
+    if (tag.empty()) {
+        return hf_cache::remove_cached_repo(repo_id);
+    }
+
+    std::string tag_upper = tag;
+    for (char & c : tag_upper) {
+        c = (char) std::toupper((unsigned char) c);
+    }
+
+    auto files = hf_cache::get_cached_files(repo_id);
+    if (files.empty()) {
+        return false;
+    }
+
+    // collect snapshot entries whose tag matches
+    std::vector<fs::path> to_remove;
+    for (const auto & f : files) {
+        auto split = get_gguf_split_info(f.path);
+        if (split.tag == tag_upper) {
+            to_remove.emplace_back(f.local_path);
+        }
+    }
+
+    if (to_remove.empty()) {
+        return false;
+    }
+
+    // resolve blob paths from symlinks before deleting snapshot entries
+    std::vector<fs::path> blobs_to_check;
+    for (const auto & p : to_remove) {
+        std::error_code ec;
+        if (fs::is_symlink(p, ec)) {
+            auto target = fs::read_symlink(p, ec);
+            if (!ec) {
+                blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
+            }
+        }
+    }
+
+    // remove snapshot entries
+    for (const auto & p : to_remove) {
+        std::error_code ec;
+        fs::remove(p, ec);
+        if (ec) {
+            LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
+        }
+    }
+
+    if (blobs_to_check.empty()) {
+        return true;
+    }
+
+    // collect blobs still referenced by remaining snapshot entries
+    std::unordered_set<std::string> still_referenced;
+    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
+        fs::path p(f.local_path);
+        std::error_code ec;
+        if (fs::is_symlink(p, ec)) {
+            auto target = fs::read_symlink(p, ec);
+            if (!ec) {
+                still_referenced.insert((p.parent_path() / target).lexically_normal().string());
+            }
+        }
+    }
+
+    // remove orphaned blobs
+    for (const auto & blob : blobs_to_check) {
+        if (still_referenced.find(blob.string()) == still_referenced.end()) {
+            std::error_code ec;
+            fs::remove(blob, ec);
+            if (ec) {
+                LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
+            }
+        }
+    }
+
+    return true;
+}
@@ -63,6 +63,7 @@ struct common_download_model_result {
    std::string model_path;
    std::string mmproj_path;
    std::string mtp_path;
+    std::string preset_path;
 };

 // throw if the file is missing or invalid (e.g. ETag check failed)
@@ -115,3 +116,10 @@ int common_download_file_single(const std::string & url,
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
+
+// Remove a cached model from disk
+// input format: "user/model" or "user/model:tag"
+// - if tag is omitted, removes the entire repo cache directory
+// - if tag is present, removes only files matching that tag (and orphaned blobs)
+// returns true if anything was removed
+bool common_download_remove(const std::string & hf_repo_with_tag);
@@ -495,4 +495,19 @@ std::string finalize_file(const hf_file & file) {
    return file.final_path;
 }

+bool remove_cached_repo(const std::string & repo_id) {
+    if (!is_valid_repo_id(repo_id)) {
+        LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
+        return false;
+    }
+    fs::path repo_path = get_repo_path(repo_id);
+    std::error_code ec;
+    auto removed = fs::remove_all(repo_path, ec);
+    if (ec) {
+        LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
+        return false;
+    }
+    return removed > 0;
+}
+
 } // namespace hf_cache
@@ -29,4 +29,7 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);

+// Remove the entire cached directory for a repo, returns true if removed
+bool remove_cached_repo(const std::string & repo_id);
+
 } // namespace hf_cache
@@ -316,12 +316,22 @@ value filter_expression::execute_impl(context & ctx) {

    JJ_DEBUG("Applying filter to %s", input->type().c_str());

+    auto set_filter_alias = [](auto & filter_id) {
+        if (filter_id == "count") {
+            filter_id = "length";
+        } else if (filter_id == "d") {
+            filter_id = "default";
+        } else if (filter_id == "e") {
+            filter_id = "escape";
+        } else if (filter_id == "trim") {
+            filter_id = "strip";
+        }
+    };
+
    if (is_stmt<identifier>(filter)) {
        auto filter_id = cast_stmt<identifier>(filter)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
        // TODO: Refactor filters so this coercion can be done automatically
        if (!input->is_undefined() && !is_val<value_string>(input) && (
@@ -345,9 +355,7 @@ value filter_expression::execute_impl(context & ctx) {
        }
        auto filter_id = cast_stmt<identifier>(call->callee)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str());
        func_args args(ctx);
        for (const auto & arg_expr : call->args) {
@@ -11,8 +11,13 @@
 #include <sstream>
 #include <thread>
 #include <vector>
+#include <algorithm>

 #if defined(_WIN32)
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#       define NOMINMAX
+#    endif
 #    include <io.h>
 #    include <windows.h>
 #    define isatty _isatty
@@ -62,16 +67,15 @@ static const char* g_col[] = {
 };

 struct common_log_entry {
-    enum ggml_log_level level;
-
-    bool prefix;
-
-    int64_t timestamp;
+    enum ggml_log_level level {GGML_LOG_LEVEL_INFO};

    std::vector<char> msg;

-    // signals the worker thread to stop
-    bool is_end;
+    int64_t timestamp { 0 };
+    bool is_end       { false }; // signals the worker thread to stop
+    bool prefix       { false };
+
+    common_log_entry(size_t size = 256) : msg(size) { }

    void print(FILE * file = nullptr) const {
        FILE * fcur = file;
@@ -122,22 +126,15 @@ struct common_log_entry {
 };

 struct common_log {
-    // default capacity - will be expanded if needed
-    common_log() : common_log(256) {}
-
-    common_log(size_t capacity) {
-        file = nullptr;
-        prefix = false;
+    // default capacity
+    common_log(size_t capacity = 512) {
+        file       = nullptr;
+        prefix     = false;
        timestamps = false;
-        running = false;
-        t_start = t_us();
-
-        // initial message size - will be expanded if longer messages arrive
-        entries.resize(capacity);
-        for (auto & entry : entries) {
-            entry.msg.resize(256);
-        }
+        running    = false;
+        t_start    = t_us();

+        queue.resize(capacity, common_log_entry(256));
        head = 0;
        tail = 0;

@@ -152,9 +149,10 @@ struct common_log {
    }

 private:
-    std::mutex mtx;
-    std::thread thrd;
-    std::condition_variable cv;
+    std::mutex              mtx;
+    std::thread             thrd;
+    std::condition_variable cv_new;  // new entry
+    std::condition_variable cv_full; // wait on full

    FILE * file;

@@ -164,24 +162,53 @@ private:

    int64_t t_start;

-    // ring buffer of entries
-    std::vector<common_log_entry> entries;
+    // queue of entries
+    std::vector<common_log_entry> queue;
    size_t head;
    size_t tail;

-    // worker thread copies into this
-    common_log_entry cur;
+    bool print_entry(const common_log_entry & e) const {
+        if (e.is_end) return true;
+
+        e.print();
+        if (file) {
+            e.print(file);
+        }
+        return false;
+    }
+
+    bool flush_queue(size_t start_head, size_t end_tail, size_t & out_head) const {
+        bool stop = false;
+        size_t h = start_head;
+        while (h != end_tail && !stop) {
+            stop = print_entry(queue[h]);
+            h = (h + 1) % queue.size();
+        }
+        out_head = h;
+        return stop;
+    }

 public:
+    bool is_full() const {
+        return ((tail + 1) % queue.size()) == head;
+    }
+
+    bool is_empty() const {
+        return head == tail;
+    }
+
    void add(enum ggml_log_level level, const char * fmt, va_list args) {
-        std::lock_guard<std::mutex> lock(mtx);
+        std::unique_lock<std::mutex> lock(mtx);
+
+        // block if the queue is full
+        cv_full.wait(lock, [this]() { return !running || !is_full(); });

        if (!running) {
            // discard messages while the worker thread is paused
            return;
        }

-        auto & entry = entries[tail];
+        auto & entry = queue[tail];

        {
            // cannot use args twice, so make a copy in case we need to expand the buffer
@@ -216,38 +243,16 @@ public:
            va_end(args_copy);
        }

-        entry.level = level;
-        entry.prefix = prefix;
+        entry.is_end    = false;
+        entry.level     = level;
+        entry.prefix    = prefix;
        entry.timestamp = 0;
        if (timestamps) {
            entry.timestamp = t_us() - t_start;
        }
-        entry.is_end = false;

-        tail = (tail + 1) % entries.size();
-        if (tail == head) {
-            // expand the buffer
-            std::vector<common_log_entry> new_entries(2*entries.size());
-
-            size_t new_tail = 0;
-
-            do {
-                new_entries[new_tail] = std::move(entries[head]);
-
-                head     = (head     + 1) % entries.size();
-                new_tail = (new_tail + 1);
-            } while (head != tail);
-
-            head = 0;
-            tail = new_tail;
-
-            for (size_t i = tail; i < new_entries.size(); i++) {
-                new_entries[i].msg.resize(256);
-            }
-
-            entries = std::move(new_entries);
-        }
-        cv.notify_one();
+        tail = (tail + 1) % queue.size();
+        cv_new.notify_one();
    }

    void resume() {
@@ -261,23 +266,24 @@ public:

        thrd = std::thread([this]() {
            while (true) {
-                {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
-                    cur = entries[head];
+                std::unique_lock<std::mutex> lock(mtx);
+                cv_new.wait(lock, [this]() { return !is_empty(); });

-                    head = (head + 1) % entries.size();
-                }
+                size_t cached_head = head;
+                size_t cached_tail = tail;

-                if (cur.is_end) {
+                lock.unlock(); // drop the lock during flush
+
+                size_t next_head;
+                bool stop = flush_queue(cached_head, cached_tail, next_head);
+
+                lock.lock();
+                head = next_head;
+                cv_full.notify_all();
+
+                if (stop) {
                    break;
                }
-
-                cur.print(); // stdout and stderr
-
-                if (file) {
-                    cur.print(file);
-                }
            }
        });
    }
@@ -293,13 +299,13 @@ public:
            running = false;

            // push an entry to signal the worker thread to stop
-            {
-                auto & entry = entries[tail];
-                entry.is_end = true;
+            auto & entry = queue[tail];
+            entry.is_end = true;
+            tail = (tail + 1) % queue.size();

-                tail = (tail + 1) % entries.size();
-            }
-            cv.notify_one();
+            // wakeup everyone
+            cv_new.notify_one();
+            cv_full.notify_all();
        }

        thrd.join();
@@ -1272,13 +1272,13 @@ common_peg_parser common_peg_parser_builder::string_content(char delimiter) {

 common_peg_parser common_peg_parser_builder::double_quoted_string() {
    return rule("double-quoted-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\""), space()});
+        return sequence({literal("\""), string_content('"'), literal("\"")});
    });
 }

 common_peg_parser common_peg_parser_builder::single_quoted_string() {
    return rule("single-quoted-string", [this]() {
-        return sequence({literal("'"), string_content('\''), literal("'"), space()});
+        return sequence({literal("'"), string_content('\''), literal("'")});
    });
 }

@@ -1301,25 +1301,25 @@ common_peg_parser common_peg_parser_builder::json_number() {
        // At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
        // This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
        auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
-        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
+        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation });
    });
 }

 common_peg_parser common_peg_parser_builder::json_string() {
    return rule("json-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\""), space()});
+        return sequence({literal("\""), string_content('"'), literal("\"")});
    });
 }

 common_peg_parser common_peg_parser_builder::json_bool() {
    return rule("json-bool", [this]() {
-        return sequence({choice({literal("true"), literal("false")}), space()});
+        return choice({literal("true"), literal("false")});
    });
 }

 common_peg_parser common_peg_parser_builder::json_null() {
    return rule("json-null", [this]() {
-        return sequence({literal("null"), space()});
+        return literal("null");
    });
 }

@@ -1334,8 +1334,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
            choice({
                literal("}"),
                sequence({members, ws, literal("}")})
-            }),
-            ws
+            })
        });
    });
 }
@@ -1350,8 +1349,7 @@ common_peg_parser common_peg_parser_builder::json_array() {
            choice({
                literal("]"),
                sequence({elements, ws, literal("]")})
-            }),
-            ws
+            })
        });
    });
 }
@@ -1381,16 +1379,13 @@ common_peg_parser common_peg_parser_builder::python_number() {

 common_peg_parser common_peg_parser_builder::python_bool() {
    return rule("python-bool", [this]() {
-        return sequence({
-            choice({literal("True"), literal("False")}),
-            space()
-        });
+        return choice({literal("True"), literal("False")});
    });
 }

 common_peg_parser common_peg_parser_builder::python_null() {
    return rule("python-none", [this]() {
-        return sequence({literal("None"), space()});
+        return literal("None");
    });
 }

@@ -1512,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
    auto pieces = matcher.collect_prefix_and_next();

    std::string pattern;
+    std::string trailing;  // optional proper-prefix of a delimiter, allowed only at the very end
    for (size_t i = 0; i < pieces.size(); ++i) {
        if (i > 0) {
            pattern += " | ";
@@ -1527,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
        }

        if (!pre.empty()) {
-            pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
+            std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
+            pattern += pre_literal + " [^" + cls + "]";
+            // Each interior alternative consumes a delimiter-prefix plus a disambiguating
+            // char, so the repetition alone cannot match a value that *ends* on a proper
+            // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
+            // "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
+            // values, so without this the grammar would reject input the parser accepts.
+            // Allow the value to terminate on any proper prefix as an optional tail.
+            // This makes the grammar a slight superset of the runtime language (a value
+            // may end on the longest prefix, which greedy first-match would not itself
+            // produce); harmless for constrained generation, which only needs to admit
+            // every runtime-valid string.
+            if (!trailing.empty()) {
+                trailing += " | ";
+            }
+            trailing += pre_literal;
        } else {
            pattern += "[^" + cls + "]";
        }
    }

-    return "(" + pattern + ")*";
+    std::string result = "(" + pattern + ")*";
+    if (!trailing.empty()) {
+        result += " (" + trailing + ")?";
+    }
+    return result;
 }

 static std::unordered_set<std::string> collect_reachable_rules(
@@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) {
    return str.substr(pos);
 }

-// only allow a subset of args for remote presets for security reasons
-// do not add more args unless absolutely necessary
-// args that output to files are strictly prohibited
-static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
-    static const std::set<std::string> allowed_options = {
-        "model-url",
-        "hf-repo",
-        "hf-repo-draft",
-        "hf-repo-v", // vocoder
-        "hf-file-v", // vocoder
-        "mmproj-url",
-        "pooling",
-        "jinja",
-        "batch-size",
-        "ubatch-size",
-        "cache-reuse",
-        "chat-template-kwargs",
-        "mmap",
-        // note: sampling params are automatically allowed by default
-        // negated args will be added automatically if the positive arg is specified above
-    };
-
-    std::set<std::string> allowed_keys;
-
-    for (const auto & it : key_to_opt) {
-        const std::string & key = it.first;
-        const common_arg & opt = it.second;
-        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
-            allowed_keys.insert(key);
-            // also add variant keys (args without leading dashes and env vars)
-            for (const auto & arg : opt.get_args()) {
-                allowed_keys.insert(rm_leading_dashes(arg));
-            }
-            for (const auto & env : opt.get_env()) {
-                allowed_keys.insert(env);
-            }
-        }
-    }
-
-    return allowed_keys;
-}
-
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
    std::vector<std::string> args;

@@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
    return value;
 }

-common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
+common_preset_context::common_preset_context(llama_example ex)
        : ctx_params(common_params_parser_init(default_params, ex)) {
    common_params_add_preset_options(ctx_params.options);
    key_to_opt = get_map_key_opt(ctx_params);
-
-    // setup allowed keys if only_remote_allowed is true
-    if (only_remote_allowed) {
-        filter_allowed_keys = true;
-        allowed_keys = get_remote_preset_whitelist(key_to_opt);
-    }
 }

 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -60,7 +60,7 @@ struct common_preset_context {
    std::set<std::string> allowed_keys;

    // if only_remote_allowed is true, only accept whitelisted keys
-    common_preset_context(llama_example ex, bool only_remote_allowed = false);
+    common_preset_context(llama_example ex);

    // load presets from INI file
    common_presets load_from_ini(const std::string & path, common_preset & global) const;
@@ -259,6 +259,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             }
        }
    }
+    if (!grmr && !grammar_str.empty()) {
+        throw std::runtime_error("failed to parse grammar");
+    }

    // Compute prefill tokens from the generation prompt
    std::vector<llama_token> prefill_tokens;
@@ -140,6 +140,8 @@ struct common_speculative_impl {
    size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
    size_t n_acc_tokens = 0; // number of tokens accepted by the target model.

+    std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
+
    // TODO: track performance of most recent calls
    const bool gen_perf = true; // whether to generate performance stats.

@@ -416,6 +418,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {

    std::vector<common_sampler_ptr> smpls;

+    // backend sampler chain per seq, attached to ctx_dft
+    std::vector<llama_sampler *> backend_chains;
+
    int32_t n_embd_dec = 0;       // draft hidden size
    int32_t n_embd_enc = 0;       // target_layer_ids_n * target_hidden_size
    int32_t n_embd_tgt = 0;       // target model hidden size
@@ -441,7 +446,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
        , params(params.draft)
    {
        LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling);

        auto * ctx_tgt = this->params.ctx_tgt;
        auto * ctx_dft = this->params.ctx_dft;
@@ -476,6 +481,22 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
            s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
        }

+        // offload draft sampling to the backend
+        backend_chains.assign(n_seq, nullptr);
+        if (this->params.backend_sampling) {
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
+
+                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
+                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    llama_sampler_free(chain);
+                    chain = nullptr;
+                }
+                backend_chains[seq_id] = chain;
+            }
+        }
+
        // turn on extraction of the target layers' input embeddings
        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
@@ -494,6 +515,18 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
    }

    ~common_speculative_impl_draft_eagle3() override {
+        auto * ctx_dft = this->params.ctx_dft;
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
+            if (backend_chains[seq_id] == nullptr) {
+                continue;
+            }
+            if (ctx_dft) {
+                llama_set_sampler(ctx_dft, seq_id, nullptr);
+            }
+            llama_sampler_free(backend_chains[seq_id]);
+        }
+        backend_chains.clear();
+
        if (batch.token != nullptr) {
            free(batch.token);
            batch.token = nullptr;
@@ -2059,6 +2092,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u

    {
        common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
+
+        if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
+            impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
+        }
+
+        for (size_t i = 0; i < n_accepted; ++i) {
+            impl->n_acc_tokens_per_pos[i]++;
+        }
+
        if (n_accepted > 0) {
            impl->n_acc_drafts++;
            impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2135,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
            str_perf = "";
        }

-        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
+        std::string str_stats;
+        if (impl->n_call_accept > 0) {
+            const double mean =
+                1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
+            std::ostringstream tmp;
+            tmp << std::fixed << std::setprecision(3);
+            for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
+                if (i > 0) {
+                    tmp << ", ";
+                }
+                tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
+            }
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(2) << mean;
+            str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
+        }
+
+        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
                common_speculative_type_to_str(impl->type).c_str(),
                impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
                impl->n_gen_drafts,
                impl->n_acc_drafts,
                impl->n_gen_tokens,
                impl->n_acc_tokens,
+                str_stats.c_str(),
                str_perf.c_str());
    }
 }
@@ -25,7 +25,7 @@ import gguf
 from gguf.constants import GGUFValueType

 # reuse model definitions from the conversion/ package
-from conversion import LazyTorchTensor, ModelBase, get_model_class
+from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture

 logger = logging.getLogger("lora-to-gguf")

@@ -396,12 +396,12 @@ if __name__ == '__main__':
        hparams = ModelBase.load_hparams(dir_base_model, False)

    with torch.inference_mode():
+        model_arch = get_model_architecture(hparams, ModelType.TEXT)
        try:
-            model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0]
-            logger.info("Using model architecture: %s", model_arch)
            model_class = get_model_class(model_arch)
+            logger.info("Using model architecture: %s", model_arch)
        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            logger.error(f"Model {model_arch} is not supported")
            sys.exit(1)

        class LoraModel(model_class):  # ty: ignore[unsupported-base]
@@ -270,7 +270,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t

 ---

-**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox.
+**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox.

 **Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.

@@ -12,6 +12,25 @@ The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a t
 - Compiles and caches the model for the target device.
 - Binds GGML tensor memory to OpenVINO inference tensors and runs inference.

+## Contents
+
+- [Supported Devices](#supported-devices)
+- [Supported Model Precisions](#supported-model-precisions)
+- [Supported Llama.cpp Tools](#supported-llamacpp-tools)
+- [Validated Models](#validated-models)
+- [Build Instructions](#build-instructions)
+  - [0. Prerequisites](#0-prerequisites)
+  - [1. Install OpenVINO Runtime](#1-install-openvino-runtime)
+  - [2. Build llama.cpp with OpenVINO Backend](#2-build-llamacpp-with-openvino-backend)
+    - [Automated Ubuntu Build Script](#automated-ubuntu-build-script)
+    - [Automated Windows Build Script](#automated-windows-build-script)
+  - [3. Download Sample Model](#3-download-sample-model)
+  - [4. Run Inference with OpenVINO Backend](#4-run-inference-with-openvino-backend)
+  - [5. Docker Build](#5-docker-build)
+- [GGML OpenVINO Backend Runtime Configurations](#ggml-openvino-backend-runtime-configurations)
+- [Known Limitations](#known-limitations)
+- [Work in Progress](#work-in-progress)
+
 ## Supported Devices

 OpenVINO backend supports the following hardware:
@@ -31,55 +50,102 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin
 - `Q4_1`
 - `Q4_K`
 - `Q4_K_M`
- `Q5_K` (converted to Q8_0_C at runtime)
- `Q6_K` (converted to Q8_0_C at runtime)
+- `Q5_K` (converted to `Q8_0_C` at runtime)
+- `Q6_K` (converted to `Q8_0_C` at runtime)

 > [!NOTE]
 > Accuracy validation and performance optimizations for quantized models are a work in progress.

-## Quantization Support Details
-
-### CPU and GPU
-
- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
+**CPU and GPU Quantization Details:**
 - `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`

-### NPU
-
- **Primary supported quantization scheme is `Q4_0`**
+**NPU Quantization Details:**
+- Primary supported quantization scheme is `Q4_0`
 - `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16

-### Additional Notes
-
+**Additional Notes:**
 - Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
 - `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize`
 - `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)
+- `Q5_1` tensors are dequantized natively (weights, scales, and zero-points extracted directly)
+
+## Supported Llama.cpp Tools
+
+The OpenVINO backend integrates with the standard llama.cpp tools listed below.
+However, all the tools coverage across all devices is not uniform and exhaustive validation is work in progress.
+
+- llama-bench
+- llama-cli
+- llama-completion
+- llama-embedding
+- llama-perplexity
+- llama-run
+- llama-server
+- llama-simple

 ## Validated Models

-The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- Additional model support, quantization formats and validations are work in progress.
+Although, the validated models below were tested with `llama-cli` using the `Q4_K_M` quantization format on Intel® Core™ Ultra Series 2 (Lunar Lake), the OpenVINO backend is expected to work across a broader range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), [supported model precisions](#supported-model-precisions), [supported llama.cpp tools](#supported-llamacpp-tools) and additional model architectures.

-| Model  | Validated   | Known Issues  |
-| :------| :---------- | :-------------|
-| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
-| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
-| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
-| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
-| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
-| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
-| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
-| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
-| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |
+> [!NOTE]
+> Extensive accuracy validation, performance optimizations, and broader architecture coverage are work in progress.
+
+**Legend & Test Configuration:**
+- **Status:** ✓ = Passed | ✗ = Failed or Unsupported
+- **Execution Modes:**
+  - **SL** = Stateless (`GGML_OPENVINO_STATEFUL_EXECUTION=0`)
+  - **SF** = Stateful (`GGML_OPENVINO_STATEFUL_EXECUTION=1`)
+  - Note: The NPU operates in stateless mode only.
+- **Validation system:** Intel® Core™ Ultra 5 238V (Lunar Lake) | 32 GB RAM | Ubuntu 24.04 | Intel OpenCL GPU Driver 26.18.38308.1 | Intel NPU Driver 1.33.0.
+- See [Known Limitations](#known-limitations) for context on observed failures.
+
+| Model | CPU (SL / SF) | GPU (SL / SF) | NPU (SL) |
+| :--- | :---: | :---: | :---: |
+| [bartowski/Llama-3.2-1B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/Llama-3.2-3B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/Meta-Llama-3.1-8B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+|  |  |  |  |
+| [Qwen/qwen2.5-1.5b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [Qwen/qwen2.5-coder-7b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/Qwen_Qwen3-0.6B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-0.6B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/Qwen_Qwen3-1.7B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-1.7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [Qwen/Qwen3-4B-Q4_K_M](https://huggingface.co/Qwen/Qwen3-4B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [lm-kit/Qwen3-8B-Q4_K_M](https://huggingface.co/lm-kit/qwen-3-8b-instruct-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ |
+|  |  |  |  |
+| [unsloth/gemma-3-4b-it-Q4_K_M](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/google_gemma-4-E2B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ |
+| [bartowski/google_gemma-4-E4B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E4B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ |
+| [bartowski/gemma-4-12B-it-Q4_K_M](https://huggingface.co/bartowski/gemma-4-12B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✗ |
+|  |  |  |  |
+| [bartowski/Phi-3-mini-4k-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3-mini-4k-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/Phi-3.5-mini-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3.5-mini-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+|  |  |  |  |
+| [bartowski/Mistral-7B-Instruct-v0.3-Q4_K_M](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [QuantFactory/Ministral-3b-instruct.Q4_K_M](https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/Ministral-8B-Instruct-2410-Q4_K_M](https://huggingface.co/bartowski/Ministral-8B-Instruct-2410-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+|  |  |  |  |
+| [bartowski/DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+|  |  |  |  |
+| [ibm-granite/granite-4.0-350m-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-350m-GGUF) | ✓ / ✓ | ✗ / ✗ | ✓ |
+| [ibm-granite/granite-4.0-micro-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-micro-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [ibm-granite/granite-4.0-1b-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-1b-GGUF) | ✓ / ✓ | ✗ / ✗ | ✗ |
+| [ibm-research/granite-3.2-8b-instruct-Q4_K_M](https://huggingface.co/ibm-research/granite-3.2-8b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+|  |  |  |  |
+| [HuggingFaceTB/smollm2-1.7b-instruct-q4_k_m](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [openbmb/MiniCPM-V-2_6-Q4_K_M](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/tencent_Hunyuan-7B-Instruct-Q4_K_M](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-Q4_K_M](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/prism-ml_Bonsai-8B-unpacked-Q4_K_M](https://huggingface.co/bartowski/prism-ml_Bonsai-8B-unpacked-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+|  |  |  |  |
+| [gpustack/bge-m3-Q4_K_M.gguf](https://huggingface.co/gpustack/bge-m3-GGUF) | ✓ | ✗ | ✗ |

 ## Build Instructions

-### Prerequisites
+### 0. Prerequisites

 - Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
+- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2026/get-started/install-openvino/configurations.html).

 - **Linux:**
    - Git, CMake, and Ninja software tools are needed for building.
@@ -119,28 +185,14 @@ The following models were validated on Intel® Core™ Ultra Series 2. While our

 - Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html)

- **Linux:**
-
-    <details>
-    <summary>📦 Click to expand OpenVINO installation from an archive file on Ubuntu</summary>
-    <br>
-
-    ```bash
-    wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
-    chmod +x install-openvino-from-archive.sh
-    ./install-openvino-from-archive.sh
-    ```
-
-    Verify OpenVINO is initialized properly:
-    ```bash
-    echo $OpenVINO_DIR
-    ```
-    </details>
-
+- Verify OpenVINO is initialized properly:
+  ```bash
+  echo $OpenVINO_DIR
+  ```

 ### 2. Build llama.cpp with OpenVINO Backend

-Clone the OpenVINO-enabled llama.cpp fork and build it:
+Clone llama.cpp repo and build :

 ```bash
 git clone https://github.com/ggml-org/llama.cpp
@@ -148,39 +200,375 @@ cd llama.cpp
 ```

 - **Linux:**
-    ```bash
-    source /opt/intel/openvino/setupvars.sh
-    cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
-    cmake --build build/ReleaseOV --parallel
-    ```
+```bash
+source /opt/intel/openvino/setupvars.sh
+cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
+cmake --build build/ReleaseOV --parallel
+```
+
+- **Windows:** Open a **Developer Command Prompt for VS 2022** (so the MSVC toolchain is on `PATH`), then run:
+
+```cmd
+C:\Intel\openvino\setupvars.bat
+cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+cmake --build build\ReleaseOV --parallel
+```

- **Windows:**
-    ```cmd
-    # x64 Native Tools Command Prompt for VS 2022
-    "C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat"
-    cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
-    cmake --build build\ReleaseOV --parallel
-    ```
 > [!NOTE]
-> Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend.
+> The Windows install path is `C:\Intel\openvino` (no spaces) to avoid quoting problems some CMake/Ninja toolchains have with `C:\Program Files (x86)\...`. Adjust to wherever you installed OpenVINO Runtime. From `cmd`, run `C:\Intel\openvino\setupvars.bat`; from PowerShell, run `& "C:\Intel\openvino\setupvars.ps1"` instead. Once the build is finished you can launch the binaries from any `cmd` or `PowerShell` window after sourcing the matching `setupvars` script for that shell.
+
+#### Automated Ubuntu Build Script
+
+For Ubuntu24 users, the following shell script automates the prerequisite installs (build tools, OpenCL ICD), the OpenVINO Runtime download/extract/setup, and the Ninja-based llama.cpp build.
+Save the following as `ubuntu-llamacpp-ov-install.sh` next to where you want the `llama.cpp` folder to land, then run it:
+
+```bash
+chmod +x ubuntu-llamacpp-ov-install.sh
+./ubuntu-llamacpp-ov-install.sh
+```
+
+<details>
+<summary>Click to expand <code>ubuntu-llamacpp-ov-install.sh</code></summary>
+
+```bash
+#!/usr/bin/env bash
+# ============================================
+# llama.cpp OpenVINO Build Script (Ninja)
+# ============================================
+set -euo pipefail
+
+OPENVINO_VERSION_MAJOR="2026.2"
+OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
+OPENVINO_LINK_DIR="/opt/intel/openvino"
+OPENVINO_TGZ="${SCRIPT_DIR}/openvino.tgz"
+OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz"
+
+echo "============================================"
+echo "Installing prerequisites (apt)..."
+echo "============================================"
+sudo apt-get update
+sudo apt-get install -y \
+    build-essential libcurl4-openssl-dev libtbb12 \
+    cmake ninja-build python3-pip \
+    curl wget tar git
+
+echo "============================================"
+echo "Installing OpenCL runtime + headers..."
+echo "============================================"
+sudo apt-get install -y \
+    ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+cd "${SCRIPT_DIR}"
+
+# ============================================
+# Clone llama.cpp if missing
+# ============================================
+if [[ ! -f "llama.cpp/CMakeLists.txt" ]]; then
+    echo "Cloning llama.cpp..."
+    git clone https://github.com/ggml-org/llama.cpp
+fi
+
+# ============================================
+# Setup OpenVINO: download & extract to /opt/intel/openvino_${OPENVINO_VERSION_MAJOR},
+# then point /opt/intel/openvino at it via symlink so the active version is swappable.
+# ============================================
+if [[ -f "${OPENVINO_INSTALL_DIR}/setupvars.sh" ]]; then
+    echo "OpenVINO ${OPENVINO_VERSION_MAJOR} already installed at ${OPENVINO_INSTALL_DIR}. Skipping download."
+else
+    echo "OpenVINO not found at ${OPENVINO_INSTALL_DIR}. Starting download..."
+    curl -L -o "${OPENVINO_TGZ}" "${OPENVINO_URL}"
+
+    echo "Extracting OpenVINO to ${OPENVINO_INSTALL_DIR}..."
+    sudo mkdir -p "${OPENVINO_INSTALL_DIR}"
+    sudo tar -xzf "${OPENVINO_TGZ}" -C "${OPENVINO_INSTALL_DIR}" --strip-components=1
+    rm -f "${OPENVINO_TGZ}"
+fi
+
+# Refresh symlink: /opt/intel/openvino -> /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+sudo ln -sfn "${OPENVINO_INSTALL_DIR}" "${OPENVINO_LINK_DIR}"
+
+OPENVINO_ROOT="${OPENVINO_LINK_DIR}"
+echo "OpenVINO Ready: ${OPENVINO_ROOT} -> ${OPENVINO_INSTALL_DIR}"
+
+# Install OpenVINO's own runtime dependencies (one-time per system).
+if [[ -x "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh" ]]; then
+    echo "============================================"
+    echo "Installing OpenVINO runtime dependencies..."
+    echo "============================================"
+    echo "Y" | sudo -E "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh"
+fi
+
+# ============================================
+# Clean old build cache
+# ============================================
+cd "${SCRIPT_DIR}/llama.cpp"
+if [[ -d "build/ReleaseOV" ]]; then
+    echo "Removing old build directory..."
+    rm -rf "build/ReleaseOV"
+fi
+
+echo "============================================"
+echo "Configuring with CMake..."
+echo "============================================"
+# shellcheck disable=SC1091
+source "${OPENVINO_ROOT}/setupvars.sh"
+
+cmake -B build/ReleaseOV -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_OPENVINO=ON
+
+cmake --build build/ReleaseOV --parallel
+
+echo "============================================"
+echo "Build completed successfully!"
+echo "============================================"
+echo "Binaries: $(pwd)/build/ReleaseOV/bin"
+echo
+echo "NOTE: To run, source setupvars.sh and pick a device:"
+echo "  source /opt/intel/openvino/setupvars.sh"
+echo "  export GGML_OPENVINO_DEVICE=CPU   # or GPU / NPU"
+echo "  ./build/ReleaseOV/bin/llama-cli -m model.gguf"
+```
+
+> [!NOTE]
+> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
+
+</details>
+
+#### Automated Windows Build Script
+
+For Windows users, the following `.bat` script automates the prerequisite installs (Git, Ninja, CMake, Visual Studio 2022 Build Tools, vcpkg + OpenCL), the OpenVINO Runtime download/extract, and the Ninja-based llama.cpp build.
+Save the following as `windows-llamacpp-ov-install.bat` next to where you want the `llama.cpp` to land, then run it from either **Command Prompt** or **PowerShell**:
+
+```cmd
+:: Command Prompt
+windows-llamacpp-ov-install.bat
+```
+
+```powershell
+# PowerShell
+.\windows-llamacpp-ov-install.bat
+```
+
+<details>
+<summary>Click to expand <code>windows-llamacpp-ov-install.bat</code></summary>
+
+```bat
+@echo off
+setlocal enabledelayedexpansion
+
+REM ============================================
+REM llama.cpp OpenVINO Build Script (Ninja)
+REM ============================================
+
+set "OPENVINO_VERSION_MAJOR=2026.2"
+set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
+
+set "SCRIPT_DIR=%~dp0"
+set "VCPKG_DIR=C:\vcpkg"
+set "OPENVINO_INSTALL_DIR=C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%"
+set "OPENVINO_LINK_DIR=C:\Intel\openvino"
+set "OPENVINO_ZIP=%SCRIPT_DIR%openvino.zip"
+set "OPENVINO_EXTRACT_TMP=%SCRIPT_DIR%openvino_extract_tmp"
+set "OPENVINO_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/%OPENVINO_VERSION_MAJOR%/windows/openvino_toolkit_windows_%OPENVINO_VERSION_FULL%_x86_64.zip"
+
+echo ============================================
+echo Installing prerequisites...
+echo ============================================
+winget install --id Git.Git -e --accept-source-agreements --accept-package-agreements 2>nul
+winget install --id Ninja-build.Ninja -e --accept-source-agreements --accept-package-agreements 2>nul
+winget install --id Kitware.CMake -e --accept-source-agreements --accept-package-agreements 2>nul
+
+REM Ensure Visual Studio Build Tools are installed.
+echo Checking for Visual Studio Build Tools...
+set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+set "VS_INSTALLED="
+if exist "%VSWHERE%" (
+    for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath 2^>nul`) do (
+        set "VS_INSTALLED=%%i"
+    )
+)
+if defined VS_INSTALLED (
+    echo Visual Studio with VC++ x86/x64 tools already present at "!VS_INSTALLED!". Skipping winget install.
+) else (
+    winget install --id Microsoft.VisualStudio.2022.BuildTools -e --override "--wait --passive --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended" --accept-source-agreements --accept-package-agreements
+    if errorlevel 1 (
+        echo WARNING: winget could not install Visual Studio Build Tools automatically.
+        echo Install manually from https://aka.ms/vs/17/release/vs_BuildTools.exe ^(select the "Desktop development with C++" workload^)
+        echo and re-run this script from a "Developer Command Prompt for VS 2022".
+    )
+)
+
+echo ============================================
+echo Installing OpenCL via vcpkg...
+echo ============================================
+if not exist "%VCPKG_DIR%" (
+    git clone https://github.com/microsoft/vcpkg "%VCPKG_DIR%"
+    cd /d "%VCPKG_DIR%"
+    call bootstrap-vcpkg.bat
+    call vcpkg integrate install
+)
+cd /d "%VCPKG_DIR%"
+call vcpkg install opencl
+
+cd /d "%SCRIPT_DIR%"
+
+REM ============================================
+REM Clone llama.cpp if missing
+REM ============================================
+if not exist "llama.cpp\CMakeLists.txt" (
+    echo Cloning llama.cpp...
+    git clone https://github.com/ggml-org/llama.cpp
+)
+
+cd /d "llama.cpp"
+set "SCRIPT_DIR=%CD%"
+
+REM ============================================
+REM Setup OpenVINO: download & extract to C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%,
+REM then point C:\Intel\openvino at it via a directory junction (mklink /J).
+REM ============================================
+
+if exist "%OPENVINO_INSTALL_DIR%\setupvars.bat" (
+    echo OpenVINO %OPENVINO_VERSION_MAJOR% already installed at "%OPENVINO_INSTALL_DIR%". Skipping download.
+) else (
+    echo OpenVINO not found at "%OPENVINO_INSTALL_DIR%". Starting download...
+
+    curl -L -o "%OPENVINO_ZIP%" "%OPENVINO_URL%"
+    if errorlevel 1 (
+        echo ERROR: Download failed.
+        exit /b 1
+    )
+
+    echo Extracting OpenVINO...
+    if exist "%OPENVINO_EXTRACT_TMP%" rmdir /s /q "%OPENVINO_EXTRACT_TMP%"
+    mkdir "%OPENVINO_EXTRACT_TMP%"
+    tar -xf "%OPENVINO_ZIP%" -C "%OPENVINO_EXTRACT_TMP%"
+    if errorlevel 1 (
+        echo ERROR: Extraction failed.
+        exit /b 1
+    )
+
+    REM Move the single top-level folder contents into the versioned install dir.
+    REM NOTE: delayed expansion (!VAR!) is required because the surrounding else( ... )
+    REM block is parsed once up-front, so %OPENVINO_EXTRACTED% would expand to "" here
+    REM and xcopy would then treat "\*" as C:\* and fail with "Cannot perform a cyclic copy".
+    set "OPENVINO_EXTRACTED="
+    for /d %%i in ("%OPENVINO_EXTRACT_TMP%\*") do set "OPENVINO_EXTRACTED=%%i"
+    if not defined OPENVINO_EXTRACTED (
+        echo ERROR: Could not locate extracted OpenVINO folder under "%OPENVINO_EXTRACT_TMP%".
+        exit /b 1
+    )
+    if not exist "%OPENVINO_INSTALL_DIR%" mkdir "%OPENVINO_INSTALL_DIR%"
+    xcopy /e /i /y /q "!OPENVINO_EXTRACTED!\*" "%OPENVINO_INSTALL_DIR%\" >nul
+    if errorlevel 1 (
+        echo ERROR: Failed to copy OpenVINO from "!OPENVINO_EXTRACTED!" to "%OPENVINO_INSTALL_DIR%".
+        echo Re-run this script from an elevated Command Prompt ^(Run as administrator^) if access is denied.
+        exit /b 1
+    )
+
+    rmdir /s /q "%OPENVINO_EXTRACT_TMP%"
+    del "%OPENVINO_ZIP%"
+)
+
+REM Refresh junction: C:\Intel\openvino -> C:\Intel\openvino_<version>.
+REM `mklink /J` creates a directory junction (no admin / Developer Mode required).
+if exist "%OPENVINO_LINK_DIR%" rmdir "%OPENVINO_LINK_DIR%"
+mklink /J "%OPENVINO_LINK_DIR%" "%OPENVINO_INSTALL_DIR%" >nul
+if errorlevel 1 (
+    echo ERROR: Failed to create junction "%OPENVINO_LINK_DIR%" -^> "%OPENVINO_INSTALL_DIR%".
+    echo If "%OPENVINO_LINK_DIR%" already exists as a regular non-empty folder, remove it manually and re-run.
+    exit /b 1
+)
+
+set "OPENVINO_ROOT=%OPENVINO_LINK_DIR%"
+echo OpenVINO Ready: %OPENVINO_ROOT% -^> %OPENVINO_INSTALL_DIR%
+
+
+echo ============================================
+echo Setting up compiler environment...
+echo ============================================
+REM Locate Visual Studio Build Tools vcvars64.bat
+set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+if exist "%VSWHERE%" (
+    for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products Microsoft.VisualStudio.Product.BuildTools -property installationPath`) do (
+        set "VS_PATH=%%i"
+    )
+)
+if defined VS_PATH (
+    call "%VS_PATH%\VC\Auxiliary\Build\vcvars64.bat" >nul
+) else (
+    echo WARNING: Visual Studio Build Tools not found. Compiler may be missing.
+)
+
+REM ============================================
+REM Clean old build cache
+REM ============================================
+if exist "build\ReleaseOV" (
+    echo Removing old build directory ...
+    rmdir /s /q "build\ReleaseOV"
+)
+
+echo ============================================
+echo Configuring with CMake...
+echo ============================================
+call "%OPENVINO_ROOT%\setupvars.bat" >nul 2>nul
+
+cmake -B build\ReleaseOV -G Ninja ^
+    -DCMAKE_BUILD_TYPE=Release ^
+    -DGGML_OPENVINO=ON ^
+    -DCMAKE_TOOLCHAIN_FILE="%VCPKG_DIR%\scripts\buildsystems\vcpkg.cmake"
+
+if errorlevel 1 (
+    echo If you continue to face CMAKE errors, make sure to install:
+    echo   winget install Microsoft.VisualStudio.2022.BuildTools
+    echo   Then run the "Developer Command Prompt for VS 2022" and launch this script from there.
+    exit /b 1
+)
+
+cmake --build build\ReleaseOV --config Release
+if errorlevel 1 exit /b 1
+
+echo ============================================
+echo Build completed successfully!
+echo ============================================
+echo Binaries: %CD%\build\ReleaseOV\bin
+echo.
+echo NOTE: To run, source setupvars.bat and pick a device:
+echo   call "C:\Intel\openvino\setupvars.bat"
+echo   set GGML_OPENVINO_DEVICE=CPU   ^&^& REM or GPU / NPU
+echo   build\ReleaseOV\bin\llama-cli.exe -m model.gguf
+echo.
+
+endlocal
+```
+
+> [!NOTE]
+> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
+
+</details>
+

 ### 3. Download Sample Model

-Download models for testing:
+Download sample model for testing.

 ```bash
 # Linux
 mkdir -p ~/models/
-wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
-     -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
+wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf \
+     -O ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf

 # Windows PowerShell
 mkdir C:\models
-Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
+Invoke-WebRequest -Uri https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf

 # Windows Command Line
 mkdir C:\models
-curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
+curl -L https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf
 ```

 ### 4. Run Inference with OpenVINO Backend
@@ -196,65 +584,45 @@ When using the OpenVINO backend, the first inference token may have slightly hig

 # Linux
 export GGML_OPENVINO_DEVICE=GPU
-# Enable stateful execution with GPU device to avoid known stateless execution failures.
+# Optional: enable stateful execution for improved GPU performance (recommended).
 export GGML_OPENVINO_STATEFUL_EXECUTION=1
 # To run llama-simple:
-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is "
 # To run in chat mode:
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024
 # To run llama-bench, -fa 1 is needed
-GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1
+GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -fa 1

 # NPU: keep context small to avoid failures from very large model context windows.
 export GGML_OPENVINO_DEVICE=NPU
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 512

 # Windows Command Line
 set GGML_OPENVINO_DEVICE=GPU
-# Enable stateful execution with GPU device to avoid known stateless execution failures.
+# Optional: enable stateful execution for improved GPU performance (recommended).
 set GGML_OPENVINO_STATEFUL_EXECUTION=1
 # Windows PowerShell
 $env:GGML_OPENVINO_DEVICE = "GPU"
 $env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"

 # To run llama-simple
-build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
+build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is "
 # To run in chat mode:
-build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
+build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 1024
 # To run llama-bench, -fa 1 is needed
-build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1
+build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -fa 1

 # NPU: keep context small to avoid failures from very large model context windows.
 # Windows Command Line
 set GGML_OPENVINO_DEVICE=NPU
 # Windows PowerShell
 $env:GGML_OPENVINO_DEVICE = "NPU"
-build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
+build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 512
 ```
 > [!NOTE]
 > On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.

-### Known Issues and Current Workarounds
-
- GPU stateless execution is currently affected by a known issue.
-  - Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
- NPU failures can happen when context size is too large. Recent llama.cpp behavior may resolve context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
-  - Workaround: explicitly set context size, for ex. `-c 1024` for NPU runs. Performance will be better with lower context size.
- Additional NPU limitations:
-  - Model caching is not yet supported.
-  - `llama-server -np > 1` (multiple parallel sequences) is not supported.
-  - `llama-perplexity` is only supported with `-b 512` or smaller.
- `--context-shift` with `llama-cli` is currently not supported with OpenVINO backend across CPU, GPU, and NPU devices.
- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
-  - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
-
-> [!NOTE]
-> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
-
-
-### Docker Build
+### 5. Docker Build

 You can build and run llama.cpp with OpenVINO backend using Docker.

@@ -272,7 +640,7 @@ docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfi
 docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .

 # If you are behind a proxy:
-docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
 ```

 Run llama.cpp with OpenVINO backend Docker container.
@@ -281,19 +649,19 @@ Save sample models in `~/models` as [shown above](#3-download-sample-model). It

 ```bash
 #  Run Docker container
-docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf

 # With Intel GPU access (iGPU or dGPU)
 docker run --rm -it -v ~/models:/models \
 --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
 --env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
-llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf

 # With Intel NPU access
 docker run --rm -it -v ~/models:/models \
 --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
 --env=GGML_OPENVINO_DEVICE=NPU \
-llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
 ```

 Run Llama.cpp Server with OpenVINO Backend.
@@ -301,17 +669,30 @@ Run Llama.cpp Server with OpenVINO Backend.
 > `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.

 ```bash
-# Run the Server Docker container
-docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
-# Or Using llama-server executable
-./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024
+# Run the llama-openvino:server Docker container (CPU)
+docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024 --host 0.0.0.0

-# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
-export NO_PROXY=localhost,127.0.0.1
+# Run the llama-openvino:server Docker container with Intel GPU access (iGPU or dGPU)
+docker run --rm -it -v ~/models:/models \
+--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-p 8080:8080 --env=GGML_OPENVINO_DEVICE=GPU  \
+llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0
+
+# Run the llama-openvino:server Docker container with Intel NPU access
+docker run --rm -it -v ~/models:/models \
+--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-p 8080:8080 --env=GGML_OPENVINO_DEVICE=NPU \
+llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0
+
+# Or Using llama-server executable
+./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --port 8080 -c 1024

 # Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
 # Option 2: In a NEW terminal, test the server with curl

+# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
+export NO_PROXY=localhost,127.0.0.1
+
 # Test health endpoint
 curl -f http://localhost:8080/health

@@ -320,24 +701,26 @@ curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: appli
 -d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
 ```

-## Runtime Configuration
+## GGML OpenVINO Backend Runtime Configurations

 The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
+Boolean flags follow a uniform convention: set to a **positive integer** (e.g. `1`) to enable; unset, empty, `0`, negative, or non-numeric values are treated as disabled.

-### Configuration Options
-
-| Variable                          | Default    | Description                                                                                                 |
-|-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------|
-| `GGML_OPENVINO_DEVICE`            | `CPU`      | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
-| `GGML_OPENVINO_CACHE_DIR`         | `not set`  | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
-| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256`      | Token chunk size for **NPU** prefill.                                                                       |
-| `GGML_OPENVINO_STATEFUL_EXECUTION`| `0`        | Enable stateful KV cache on for better performance. Recommended on CPU, GPU.                                |
-| `GGML_OPENVINO_PROFILING`         | `0`        | Enable execution-time profiling.                                                                            |
-| `GGML_OPENVINO_DUMP_CGRAPH`       | `0`        | Dump the GGML compute graph to `cgraph_ov.txt`.                                                             |
-| `GGML_OPENVINO_DUMP_IR`           | `0`        | Serialize OpenVINO IR files with timestamps.                                                                |
-| `GGML_OPENVINO_DEBUG_INPUT`       | `0`        | Enable input debugging and print input tensor info.                                                         |
-| `GGML_OPENVINO_DEBUG_OUTPUT`      | `0`        | Enable output debugging and print output tensor info.                                                       |
-| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once.                                                                           |
+| Variable                          | Type      | Default    | Description                                                                                                 |
+|-----------------------------------|-----------|------------|-------------------------------------------------------------------------------------------------------------|
+| `GGML_OPENVINO_DEVICE`            | String    | `CPU`      | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
+| `GGML_OPENVINO_CACHE_DIR`         | String    | `not set`  | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
+| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| Integer   | `256`      | Token chunk size for **NPU** prefill (NPU-only; ignored on CPU/GPU). Must be a positive integer; otherwise the default is used. |
+| `GGML_OPENVINO_STATEFUL_EXECUTION`| Boolean   | `0`        | Enable stateful KV cache for better performance. Recommended on CPU, GPU.                                   |
+| `GGML_OPENVINO_DISABLE_CACHE`     | Boolean   | `0`        | Disable the in-process compiled-model / decoder cache (cache is on by default). Set to `1` to disable.      |
+| `GGML_OPENVINO_DISABLE_KV_SLICE`  | Boolean   | `0`        | Disable the KV-cache input-tensor slicing optimization (slicing is on by default on CPU/GPU). Set to `1` to disable. |
+| `GGML_OPENVINO_MANUAL_GQA_ATTN`   | Boolean   | device-based | Tri-state. When **unset**, manual GQA attention is enabled by default on `GPU` and disabled on other devices. Set to a positive integer to force-enable, or `0` to force-disable. |
+| `GGML_OPENVINO_PROFILING`         | Boolean   | `0`        | Enable execution-time profiling.                                                                            |
+| `GGML_OPENVINO_DUMP_CGRAPH`       | Boolean   | `0`        | Dump the GGML compute graph to `cgraph_ov.txt`.                                                             |
+| `GGML_OPENVINO_DUMP_IR`           | Boolean   | `0`        | Serialize OpenVINO IR files with timestamps.                                                                |
+| `GGML_OPENVINO_DEBUG_INPUT`       | Boolean   | `0`        | Enable input debugging and print input tensor info.                                                         |
+| `GGML_OPENVINO_DEBUG_OUTPUT`      | Boolean   | `0`        | Enable output debugging and print output tensor info.                                                       |
+| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | Boolean | `0` | Print tensor address map once.                                                                           |

 > [!NOTE]
 >`GGML_OPENVINO_STATEFUL_EXECUTION` is an **Experimental** feature to allow stateful execution for managing the KV cache internally inside the OpenVINO model, improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support this feature. This feature is experimental and has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications and is recommended to enable for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
@@ -355,7 +738,7 @@ export GGML_OPENVINO_PROFILING=1
 export GGML_OPENVINO_DEVICE=GPU
 export GGML_OPENVINO_STATEFUL_EXECUTION=1

-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is "

 # Windows Command Line
 set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
@@ -369,19 +752,39 @@ $env:GGML_OPENVINO_PROFILING = "1"
 $env:GGML_OPENVINO_DEVICE = "GPU"
 $env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"

-build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
+build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is "

 ```

-## Llama.cpp Tools
+## Known Limitations

-The following tools work with the OpenVINO backend on CPU, GPU, NPU:
- llama-bench
- llama-cli
- llama-completion
- llama-perplexity
- llama-server
- llama-simple
+**General (all devices)**
+
+- Llama.cpp OpenVINO backend currently supports a subset of GGML ops and text-only models. Unsupported ops or unsupported op shapes/cases fail during OpenVINO translation.
+- Multimodal features (audio/image/video) are a work in progress.
+- Limited Embedding and Reranking model support.
+- Llama.cpp tool coverage across CPU/GPU/NPU is not uniform.
+
+**Tool-specific**
+
+- `llama-bench`: requires `-fa 1` (flash-attention).
+- `llama-cli --context-shift`: stateless only (`GGML_OPENVINO_STATEFUL_EXECUTION=0`). In stateful mode the KV cache is owned by the OpenVINO model and cannot be shifted externally.
+- `llama-server`: only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1`.
+
+**GPU-specific**
+
+- `llama-server -np > 1`: concurrent requests are batched together, which may slightly reduce per-request throughput.
+
+**NPU-specific**
+
+- Default context resolves to the model's training context (e.g. 131072 for Llama 3.2 1B), which can OOM or fail or degrade performance on NPU. Inspect the resolved value with `-lv 3`.
+  - **Workaround:** Pass an explicit `-c <N>`, e.g. `-c 1024`.
+- NPU device uses a static graph with a fixed prefill chunk size (defaults to 256), configurable with `GGML_OPENVINO_PREFILL_CHUNK_SIZE`. Large prefill/batch settings may need tuning.
+- `llama-server -np > 1` (multiple parallel sequences) is not supported.
+- `llama-perplexity`: requires `-b 512` or smaller.
+
+> [!NOTE]
+> The OpenVINO backend is actively under development. Fixes and improvements are underway, and this document will continue to be updated.

 ## Work in Progress

@@ -161,6 +161,64 @@ You could update your test result in it directly.

 Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.

+## Quick Development WOW
+
+This chapter is for quick development & try with SYCL backend on Intel GPU.
+
+You need to install following sofeware before development:
+   - Intel GPU driver
+   - oneAPI package
+   - other development tools.
+
+Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide.
+
+- Linux
+
+```
+## build from source code
+./examples/sycl/build.sh
+
+## run CONV_2D_DW unit test cases
+./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
+
+## run all unit test cases
+./build/bin/test-backend-ops -b SYCL0
+
+## run with LLM on the first GPU
+./examples/sycl/test.sh -mg 0 -m xxxx.gguf
+
+## run service with LLM on the first GPU
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+./examples/sycl/start-svr.sh -m xxxx.gguf
+
+## update the docs/ops.md for new/update OPs
+./examples/sycl/update-ops-doc.sh
+```
+
+- Windows
+
+```
+## build from source code
+examples\sycl\win-build-sycl.bat
+
+## run CONV_2D_DW unit test cases
+build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
+
+## run all unit test cases
+build\bin\test-backend-ops.exe -b SYCL0
+
+## run LLM on the first GPU
+examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
+
+## run service with LLM on the first GPU
+set ONEAPI_DEVICE_SELECTOR="level_zero:0"
+examples\sycl\win-start-svr.bat -m xxxx.gguf
+
+## update the docs/ops.md for new/update OPs
+examples\sycl\win-update-ops-doc.bat
+```
+
+
 ## Linux

 ### I. Setup Environment
@@ -253,6 +311,7 @@ When targeting an intel GPU, the user should expect one or more devices among th
 #### Intel GPU

 ```sh
+# Uses FP32, consider using FP16 for better performance in most cases
 ./examples/sycl/build.sh
 ```

@@ -262,12 +321,12 @@ or
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
+# Option 1: Use FP16 (recommended for better performance in most cases)
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

+# Option 2: Use FP32
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
 # build all binary
 cmake --build build --config Release -j -v
 ```
@@ -469,6 +528,7 @@ Choose one of following methods to build from source code.
 ##### Option 1: Script

 ```sh
+# Uses FP32, consider using FP16 for better performance in most cases
 .\examples\sycl\win-build-sycl.bat
 ```

@@ -479,11 +539,11 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
 ```
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+# Option 1: Use FP16 (recommended for better performance in most cases)
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

-# Option 2: Or FP16
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+# Option 2: Or FP32
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

 cmake --build build --config Release -j
 ```
@@ -491,10 +551,10 @@ cmake --build build --config Release -j
 Or, use CMake presets to build:

 ```sh
-cmake --preset x64-windows-sycl-release
+cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-completion

-cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-completion

 cmake --preset x64-windows-sycl-debug
@@ -699,7 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
-| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
+| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -710,14 +770,16 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name              | Value            | Function                                                                                                                  |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
+| GGML_SYCL_DEV2DEV_MEMCPY | 0 (default) or 1 | Choose the SYCL or L0 API in dev2dev memory copy.<br>Value: <br>*  0: SYCL API (default)<br>* 1: L0 API -- L0 API is found to lead to abnormal crash in some case. This debug flag is used to check the issue.|
 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
-| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
+| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
+| GGML_SYCL_USM_SYSTEM | 0 (default) or 1 | Enable experimental support for [USM system allocations](https://github.khronos.org/SYCL_Reference/iface/usm_basic_concept.html#system-allocations) for large GPU buffers. This requires enough host memory for model weights and caches, an Intel Xe2+ GPU such as BMG or newer and supported on Linux only, with CONFIG_DRM_XE_GPUSVM enabled. |

 ## Compile-time Flags

@@ -728,6 +790,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
 | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
 | DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |

+
 ## Design Rule

 - Open to all contributors.
@@ -1,12 +1,40 @@
 # Install pre-built version of llama.cpp

-| Install via | Windows | Mac | Linux |
-|-------------|---------|-----|-------|
+| Install via | Windows | Mac  | Linux |
+|-------------|---------|------|-------|
+| conda-forge | ✅      | ✅   | ✅   |
 | Winget      | ✅      |      |      |
 | Homebrew    |         | ✅   | ✅   |
 | MacPorts    |         | ✅   |      |
 | Nix         |         | ✅   | ✅   |

+## conda-forge (Windows, Mac and Linux)
+
+conda-forge provides builds for:
+ - CUDA (Windows and Linux)
+ - Vulkan (Windows and Linux)
+ - Apple Metal (macOS)
+
+```sh
+conda install -c conda-forge llama-cpp
+```
+
+```sh
+mamba install -c conda-forge llama-cpp
+```
+
+```sh
+# Project-local installation
+pixi add llama-cpp
+
+# Global installation
+pixi global install llama-cpp
+```
+
+This distribution is managed on [`conda-forge/llama-cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/).
+
+Shall you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues).
+
 ## Winget (Windows)

 ```sh
@@ -1,10 +1,11 @@
 # Multimodal

 llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature:
- [llama-mtmd-cli](../tools/mtmd/README.md)
+- [llama-cli](../tools/cli/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
+- [llama-mtmd-cli](../tools/mtmd/README.md), for testing and development

-Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+Currently, we support **image**, **audio** and **video** input.

 To enable it, you can use one of the 2 methods below:

@@ -23,15 +23,16 @@ Legend:
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                             CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                        COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -43,10 +44,10 @@ Legend:
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -64,7 +65,7 @@ Legend:
 |                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
@@ -77,7 +78,7 @@ Legend:
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
 |                              PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -88,7 +89,7 @@ Legend:
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -98,13 +99,13 @@ Legend:
 |                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -117,6 +118,6 @@ Legend:
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
@@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla

 When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.

-### Using a Remote Preset
+### Using a Hugging Face Preset

-> [!NOTE]
+> [!IMPORTANT]
 >
-> This feature is currently only supported via the `-hf` option.
+> Please only use presets that you can trust! Unknown presets may be unsafe

-For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
+You can push your preset to Hugging Face Hub and share with other users by:
+1. Creating an empty model repository on Hugging Face
+2. Creating a `preset.ini` file in the root directory of the repository

-Example:
+Example of a `preset.ini`:

 ```ini
-hf-repo-draft = username/my-draft-model-GGUF
-temp = 0.5
-top-k = 20
-top-p = 0.95
+[*]
+ctx-size             = 0
+mmap                 = 1
+kv-unified           = 1
+parallel             = 4
+spec-default         = 1
+
+[Qwen3.5-4B]
+hf                   = unsloth/Qwen3.5-4B-GGUF:Q4_K_M
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+
+[gpt-oss-120b-hf]
+hf                   = ggml-org/gpt-oss-120b-GGUF
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+chat-template-kwargs = {"reasoning_effort": "high"}
 ```

-For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
-
-Example usage:
-
-Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
-
-```sh
-llama-cli -hf username/my-model-with-preset
-
-# This is equivalent to:
-llama-cli -hf username/my-model-with-preset \
-  --hf-repo-draft username/my-draft-model-GGUF \
-  --temp 0.5 \
-  --top-k 20 \
-  --top-p 0.95
-```
-
-You can also override preset arguments by specifying them on the command line:
+The preset will be loaded similarly to the `--models-preset` option. Therefore, you can also override certain params via CLI arguments:

 ```sh
 # Force temp = 0.1, overriding the preset value
-llama-cli -hf username/my-model-with-preset --temp 0.1
-```
-
-If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
-
-```ini
-hf-repo = user/my-model-main
-hf-repo-draft = user/my-model-draft
-temp = 0.8
-ctx-size = 1024
-; (and other configurations)
+llama-cli -hf username/my-preset --temp 0.1
 ```

 ### Named presets
@@ -3,15 +3,45 @@
 #  Copyright (C) 2024 Intel Corporation
 #  SPDX-License-Identifier: MIT

+print_usage() {
+    echo "Usage: ./build.sh [fp32|fp16] [--help]"
+    echo ""
+    echo "Options:"
+    echo "  fp32    Build with FP32 precision (default)"
+    echo "  fp16    Build with FP16 precision (faster for long-prompt inference)"
+    echo "  --help  Print this help message"
+}
+
+PRECISION=fp32
+
+for arg in "$@"; do
+    case "$arg" in
+        --help)
+            print_usage
+            exit 0
+            ;;
+        fp32|fp16)
+            PRECISION="$arg"
+            ;;
+        *)
+            echo "Error: unknown option '$arg'"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
 mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh

-#for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
-
-#for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+if [ "$PRECISION" = "fp16" ]; then
+    #for FP16
+    cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
+else
+    #for FP32
+    cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+fi

 #build example/main
 #cmake --build . --config Release --target main
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2026 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
+./scripts/create_ops_docs.py
+
@@ -3,6 +3,23 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT

+IF /I "%1"=="--help" (
+    echo Usage: win-build-sycl.bat [fp32^|fp16] [--help]
+    echo.
+    echo Options:
+    echo   fp32    Build with FP32 precision ^(default^)
+    echo   fp16    Build with FP16 precision ^(faster for long-prompt inference^)
+    echo   --help  Print this help message
+    exit /B 0
+)
+
+SET PRECISION=%1
+IF "%PRECISION%"=="" SET PRECISION=fp32
+IF /I NOT "%PRECISION%"=="fp32" IF /I NOT "%PRECISION%"=="fp16" (
+    echo Error: invalid value '%PRECISION%'. Use 'fp32' or 'fp16'.
+    echo Usage: win-build-sycl.bat [fp32^|fp16] [--help]
+    exit /B 1
+)

 IF not exist build (mkdir build)
 cd build
@@ -11,12 +28,14 @@ if %errorlevel% neq 0 goto ERROR
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 if %errorlevel% neq 0 goto ERROR

-::  for FP16
-::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
-
-::  for FP32
-cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+IF /I "%PRECISION%"=="fp16" (
+    ::  for FP16
+    ::  faster for long-prompt inference
+    cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+) ELSE (
+    ::  for FP32
+    cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+)
 if %errorlevel% neq 0 goto ERROR

 ::  build all binary
@@ -0,0 +1,8 @@
+@echo off
+
+rem MIT license
+rem Copyright (C) 2026 Intel Corporation
+rem SPDX-License-Identifier: MIT
+
+build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
+python scripts\create_ops_docs.py
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 15)
-set(GGML_VERSION_PATCH 1)
+set(GGML_VERSION_PATCH 2)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -249,7 +249,7 @@ option(GGML_SYCL                            "ggml: use SYCL"
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
-option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"  ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API     "ggml: use Level Zero API in SYCL backend"  ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
@@ -438,7 +438,14 @@ if (GGML_CPU_ALL_VARIANTS)
            ggml_add_cpu_backend_variant(power8_2       POWER8  VSX)
            ggml_add_cpu_backend_variant(power9         POWER9  VSX)
            ggml_add_cpu_backend_variant(power10        POWER10 VSX)
-            ggml_add_cpu_backend_variant(power11        POWER11 VSX)
+            # POWER11 backend: only if compiler supports -mcpu=power11
+            check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
+            if (GGML_CXX_SUPPORTS_POWER11)
+                message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
+                ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+            else()
+                message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
+            endif()
        else()
            message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
        endif()
@@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

-            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+            if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11)
                list(APPEND ARCH_FLAGS -mcpu=power10)
            elseif (EXTRACTED_NUMBER EQUAL 9)
                list(APPEND ARCH_FLAGS -mcpu=power9)
@@ -293,7 +293,6 @@
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__wasm__)
 // quants.c
-#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
@@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC {
            else if (n_aligned % 16 == 0) nc = 16;
            else                          nc = 8;
        }
-        bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0);
+        bool can_use_tiled = n_aligned > 0 && (m % mc == 0);
        if (can_use_tiled) {
            matmul_tiled(m, n_aligned, mc, nc, kc);
            if (n > n_aligned) {
@@ -3063,13 +3063,14 @@ class tinyBLAS_Q0_PPC {
            int64_t ii = (job / xtiles) * mc;
            int64_t jj = (job % xtiles) * nc;
            for (int64_t kk = 0; kk < k; kk += kc) {
+                int64_t k_cur = MIN(kc, k - kk);
                if constexpr(is_Ablock_q4) {
-                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
+                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
                } else {
-                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
+                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
                }
-                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack);
-                KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack);
+                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, k_cur, (uint8_t *)B_pack);
+                KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack);
            }
        }
    }
@@ -0,0 +1,81 @@
+#include "col2im-1d.cuh"
+#include "convert.cuh"
+
+// col2im_1d: scatter-add GEMM columns to 1D signal (gather approach)
+// columns: [K*OC, T_in]  ->  output: [T_out, OC]
+// Supports F32, F16, BF16 data with F32 accumulator.
+
+template <typename T>
+static __global__ void col2im_1d_kernel(
+        const T * __restrict__ col,
+        T       * __restrict__ dst,
+        const int T_in, const uint3 T_out_fd,
+        const int OC, const int K, const int K_OC,
+        const int s0, const int p0, const int total) {
+
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= total) return;
+
+    // dst layout: [T_out, OC], ne[0]=T_out fastest
+    const uint2 qr  = fast_div_modulo((uint32_t)idx, T_out_fd);  // qr.x = idx / T_out, qr.y = idx % T_out
+    const int oc    = (int)qr.x;
+    const int t_out = (int)qr.y;
+    const int t_abs = t_out + p0;  // absolute position in uncropped signal
+
+    // Gather: find all (t_in, k) where t_in*s + k == t_abs, 0 <= k < K
+    int t_in_min = (t_abs - K + s0) / s0;  // ceil((t_abs - K + 1) / s)
+    if (t_in_min < 0) t_in_min = 0;
+    int t_in_max = t_abs / s0;
+    if (t_in_max >= T_in) t_in_max = T_in - 1;
+
+    float sum = 0.0f;
+    for (int t_in = t_in_min; t_in <= t_in_max; t_in++) {
+        const int k = t_abs - t_in * s0;
+        // col layout: [K*OC, T_in], column index = oc * K + k
+        sum += ggml_cuda_cast<float>(col[(oc * K + k) + t_in * K_OC]);
+    }
+
+    dst[idx] = ggml_cuda_cast<T>(sum);
+}
+
+void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t OC = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+
+    const int K_OC = (int) src0->ne[0];
+    const int T_in = (int) src0->ne[1];
+    const int K    = K_OC / OC;
+    const int T_out = (int) dst->ne[0];
+
+    const uint3 T_out_fd = init_fastdiv_values((uint32_t)T_out);
+
+    const int total = T_out * OC;
+    const int block_size = 256;
+    const int num_blocks = (total + block_size - 1) / block_size;
+
+    switch (src0->type) {
+        case GGML_TYPE_F32: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const float *)src0->data, (float *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        case GGML_TYPE_F16: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const half *)src0->data, (half *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        case GGML_TYPE_BF16: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        default:
+            GGML_ABORT("col2im_1d: unsupported type");
+    }
+}
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -11,6 +11,7 @@
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
+#include "ggml-cuda/col2im-1d.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/conv2d.cuh"
@@ -622,18 +623,6 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {

 // cuda buffer

-struct ggml_backend_cuda_device_context {
-    int device;
-    std::string name;
-    std::string description;
-    std::string pci_bus_id;
-    int op_offload_min_batch_size;
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    std::mutex device_mutex;
-    int active_count = 0;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-};
-
 struct ggml_backend_cuda_buffer_context {
    int device;
    void * dev_ptr = nullptr;
@@ -651,13 +640,6 @@ struct ggml_backend_cuda_buffer_context {

 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count--;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    delete ctx;
 }

@@ -810,12 +792,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac

    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count++;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }

@@ -1515,12 +1491,6 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
 }

 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count--;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    CUDA_CHECK(cudaFreeHost(buffer->context));
 }

@@ -1529,8 +1499,6 @@ static void * ggml_cuda_host_malloc(size_t size) {
        return nullptr;
    }

-    ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0.
-
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
@@ -1556,12 +1524,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
    buffer->buft = buft;
    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count++;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    return buffer;
 }

@@ -3090,6 +3052,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cuda_op_conv_transpose_1d(ctx,dst);
            break;
+        case GGML_OP_COL2IM_1D:
+            ggml_cuda_op_col2im_1d(ctx, dst);
+            break;
        case GGML_OP_POOL_2D:
            ggml_cuda_op_pool2d(ctx, dst);
            break;
@@ -3179,12 +3144,6 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
 static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count--;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    delete cuda_ctx;
    delete backend;
 }
@@ -4916,6 +4875,14 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {

 // backend device

+struct ggml_backend_cuda_device_context {
+    int device;
+    std::string name;
+    std::string description;
+    std::string pci_bus_id;
+    int op_offload_min_batch_size;
+};
+
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    return ctx->name.c_str();
@@ -5004,11 +4971,6 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k

 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    std::lock_guard<std::mutex> lock(ctx->device_mutex);
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemGetInfo(free, total));

@@ -5035,13 +4997,6 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
    }
 #endif // defined(__linux__)

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    // If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
-    // context that permanently consumes VRAM. Reset the device to free it.
-    if (ctx->active_count == 0) {
-        CUDA_CHECK(cudaDeviceReset());
-    }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }

 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
@@ -5337,8 +5292,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            } break;
        case GGML_OP_REPEAT:
            {
+                // the CUDA REPEAT path only implements F32/F16; other types assert at runtime
                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+                return src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16;
            } break;
        case GGML_OP_REPEAT_BACK:
                return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
@@ -5364,6 +5320,14 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                }
                return false;
            } break;
+        case GGML_OP_COL2IM_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16 || src0_type == GGML_TYPE_BF16) &&
+                    op->type == src0_type &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op);
+            } break;
        case GGML_OP_SILU_BACK:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
            break;
@@ -5744,21 +5708,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
        return nullptr;
    }

-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);
-
    ggml_backend_t cuda_backend = new ggml_backend {
        /* .guid    = */ ggml_backend_cuda_guid(),
        /* .iface   = */ ggml_backend_cuda_interface,
-        /* .device  = */ dev,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
        /* .context = */ ctx,
    };

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count++;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    return cuda_backend;
 }

@@ -69,6 +69,7 @@ static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
 static int opt_opbatch  = 1024; // max number of ops in a batch
 static int opt_opqueue  = 16;   // max number of pending batches
 static int opt_oppoll   = 0;    // polling for batch completions
+static int opt_optrace  = 0;    // trace buffer size per thread (0 means default)

 static std::regex* opt_opfilter = NULL; // regex of ops to not claim

@@ -118,20 +119,39 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
                ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
 }

+static const char * htp_event_name(uint16_t id) {
+    switch (id) {
+        case HTP_TRACE_EVT_DMA:            return "DMA";
+        case HTP_TRACE_EVT_HVX_COMP:       return "HVX_COMP";
+        case HTP_TRACE_EVT_HVX_A_QUANT:    return "HVX_A_QUANT";
+        case HTP_TRACE_EVT_HVX_A_PREP:     return "HVX_A_PREP";
+        case HTP_TRACE_EVT_HVX_W_DEQUANT:  return "HVX_W_DEQUANT";
+        case HTP_TRACE_EVT_HVX_W_PREP:     return "HVX_W_PREP";
+        case HTP_TRACE_EVT_HVX_O_PROC:     return "HVX_O_PROC";
+        case HTP_TRACE_EVT_HMX_COMP:       return "HMX_COMP";
+        default:                           return "UNKNOWN";
+    }
+}
+
 static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
-                                      uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
+                                      const htp_prof_desc & pd) {
    if (!opt_profile) return;

+    uint32_t op_usec = pd.usecs;
+    uint32_t op_cycles = pd.cycles_stop - pd.cycles_start;
+    const uint32_t * pmu = pd.pmu;
+
    char pmu_str[256] = "";
-    if (opt_profile > 1) {
+    if (opt_profile == 2) {
        static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
        sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
                pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
    }

    htp_opformat fmt(node);
-    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
-            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
+    float mhz = op_usec > 0 ? (float) op_cycles / op_usec : 0.0f;
+    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u start %u mhz %.1f%s\n", sess_name.c_str(),
+            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pd.cycles_start, mhz, pmu_str);
 }

 // ** backend sessions
@@ -1995,10 +2015,16 @@ struct ggml_hexagon_opqueue {
        size_t n_ops     = batch_size;
        size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;

+        size_t tr_size = 0;
+        if (opt_profile == 3) {
+            tr_size = (HTP_MAX_NTHREADS + 1) * opt_optrace * sizeof(htp_trace_desc);
+        }
+
        shm_blk_size = sizeof(htp_buf_desc)  * n_bufs    +
                       sizeof(htp_tensor)    * n_tensors +
                       sizeof(htp_op_desc)   * n_ops     +
-                       sizeof(htp_prof_desc) * n_ops;
+                       sizeof(htp_prof_desc) * n_ops     +
+                       tr_size;

        shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);

@@ -2042,11 +2068,19 @@ struct ggml_hexagon_opqueue {
        const size_t o_size = sizeof(htp_op_desc)   * req.n_ops;
        const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;

+        size_t tr_size = 0;
+        if (opt_profile == 3) {
+            req.n_traces = opt_optrace;
+            tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(htp_trace_desc);
+        } else {
+            req.n_traces = 0;
+        }
+
        dbuf.ptr      = shm_buf->base + (req.id * shm_blk_size);
        dbuf.fd       = shm_buf->fd;
        dbuf.flags    = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
        dbuf.offset   = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
-        dbuf.size     = b_size + t_size + o_size + p_size;
+        dbuf.size     = b_size + t_size + o_size + p_size + tr_size;

        GGML_ASSERT(dbuf.size <= shm_blk_size);

@@ -2092,7 +2126,14 @@ struct ggml_hexagon_opqueue {
        const size_t o_size = sizeof(htp_op_desc)   * rsp.n_ops;
        const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;

-        const size_t m_size = b_size + t_size + o_size + p_size;
+        size_t tr_size = 0;
+        uint32_t n_traces = 0;
+        if (opt_profile == 3) {
+            n_traces = opt_optrace;
+            tr_size = (HTP_MAX_NTHREADS + 1) * n_traces * sizeof(htp_trace_desc);
+        }
+
+        const size_t m_size = b_size + t_size + o_size + p_size + tr_size;
        GGML_ASSERT(m_size <= shm_blk_size);

        HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
@@ -2111,13 +2152,62 @@ struct ggml_hexagon_opqueue {
            GGML_ASSERT(rsp.n_ops <= ops.size());

            const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
-            for (uint32_t i = 0; i < rsp.n_ops; i++) {
-                htp_usec += pd[i].usecs;
-                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
+
+            const htp_trace_desc * trace_events = nullptr;
+
+            if (opt_profile == 3) {
+                trace_events = (const htp_trace_desc *) (p_ptr + p_size);
            }

-            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
-                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
+            uint32_t trace_idx[HTP_MAX_NTHREADS + 1] = {0};
+            uint32_t valid_cnt[HTP_MAX_NTHREADS + 1] = {0};
+
+            if (opt_profile == 3) {
+                for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                    uint32_t count = rsp.n_traces[t];
+                    valid_cnt[t] = count > n_traces ? n_traces : count;
+                }
+            }
+
+            for (uint32_t i = 0; i < rsp.n_ops; i++) {
+                htp_usec += pd[i].usecs;
+
+                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i]);
+
+                if (opt_profile == 3) {
+                    uint32_t op_duration = pd[i].cycles_stop - pd[i].cycles_start;
+
+                    for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                        while (trace_idx[t] < valid_cnt[t]) {
+                            const auto & e = trace_events[t * n_traces + trace_idx[t]];
+                            uint32_t offset = e.cycles - pd[i].cycles_start;
+                            if (offset >= 0x80000000) {
+                                trace_idx[t]++;
+                                continue;
+                            }
+                            if (offset > op_duration) {
+                                break;
+                            }
+                            bool is_stop = (e.info & 0x8000) != 0;
+                            uint16_t info = e.info & 0x7FFF;
+                            GGML_LOG_DEBUG("ggml-hex: %s trace-op %s: thread %u event %s info %u %s %u\n",
+                                           shm_buf->sess->c_name(), ops[i].op_name().c_str(), t, htp_event_name(e.id), info, is_stop ? "stop" : "start", e.cycles);
+                            trace_idx[t]++;
+                        }
+                    }
+                }
+            }
+
+            char evt_str[256] = "";
+            if (opt_profile == 3) {
+                sprintf(evt_str, " evt [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u]",
+                        rsp.n_traces[0], rsp.n_traces[1], rsp.n_traces[2], rsp.n_traces[3],
+                        rsp.n_traces[4], rsp.n_traces[5], rsp.n_traces[6], rsp.n_traces[7],
+                        rsp.n_traces[8], rsp.n_traces[9], rsp.n_traces[10]);
+            }
+
+            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u%s\n",
+                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec, evt_str);
        }
    }
 };
@@ -3901,6 +3991,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
    const char * str_opbatch  = getenv("GGML_HEXAGON_OPBATCH");
    const char * str_opqueue  = getenv("GGML_HEXAGON_OPQUEUE");
    const char * str_oppoll   = getenv("GGML_HEXAGON_OPPOLL");
+    const char * str_optrace  = getenv("GGML_HEXAGON_OPTRACE");
    const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
    const char * str_profile  = getenv("GGML_HEXAGON_PROFILE");
    const char * str_etm      = getenv("GGML_HEXAGON_ETM");
@@ -3939,6 +4030,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
    opt_opbatch   = str_opbatch  ? strtoul(str_opbatch, NULL, 0)          : opt_opbatch;
    opt_opqueue   = str_opqueue  ? strtoul(str_opqueue, NULL, 0)          : opt_opqueue;
    opt_oppoll    = str_oppoll   ? strtoul(str_oppoll,  NULL, 0)          : opt_oppoll;
+    opt_optrace   = str_optrace  ? strtoul(str_optrace, NULL, 0)          : (opt_opbatch * 128);
    opt_profile   = str_profile  ? atoi(str_profile)                      : 0;
    opt_etm       = str_etm      ? atoi(str_etm)                          : 0;
    opt_nhvx      = str_nhvx     ? strtoul(str_nhvx, NULL, 0)             : opt_nhvx;
@@ -37,8 +37,8 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)

 if (_hmx_idx GREATER_EQUAL 0)
    target_sources(${HTP_LIB} PRIVATE
-        hmx-matmul-ops.c
        hmx-flash-attn-ops.c
+        hmx-matmul-ops.c
        hmx-queue.c
    )

@@ -339,6 +339,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *

    if (ir0 >= ir1) return;

+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
+
    dma_queue * dma = octx->ctx->dma[ith];

    const uint32_t DK = nek0;
@@ -615,6 +618,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
            hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
        }
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
 }

 int op_flash_attn_ext(struct htp_ops_context * octx) {
@@ -6,6 +6,8 @@
 #include <stdbool.h>
 #include <stdint.h>

+#include "hex-profile.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -88,6 +90,7 @@ typedef struct {
    uint32_t            pop_idx;
    uint32_t            capacity;
    uint32_t            idx_mask;
+    struct htp_thread_trace * trace;
 } dma_queue;

 dma_queue * dma_queue_create(size_t capacity);
@@ -152,6 +155,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
    q->dptr[q->push_idx] = dptr;

    if (size) {
+        htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx);
        dmlink(q->tail, desc);
        q->tail = (dma_descriptor_2d *) desc;
    } else {
@@ -202,6 +206,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
    q->dptr[q->push_idx] = dptr;

    if (nrows) {
+        htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx);
        dmlink(q->tail, desc);
        q->tail = desc;
    } else {
@@ -223,10 +228,12 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
    dma_descriptor_2d * desc = &q->desc[q->pop_idx];

    // Wait for desc to complete
-    while (!desc->done) {
-        // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
-        dmpoll();
+    if (!desc->done) {
+        while (!desc->done) {
+            dmpoll();
+        }
    }
+    htp_trace_event_stop(q->trace, HTP_TRACE_EVT_DMA, q->pop_idx);

    dptr = q->dptr[q->pop_idx];

@@ -0,0 +1,64 @@
+#ifndef HEX_PROFILE_H
+#define HEX_PROFILE_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <qurt.h>
+
+#include "hex-utils.h"
+#include "htp-ops.h"
+
+#define HTP_TRACE_EVT_START 0
+#define HTP_TRACE_EVT_STOP  1
+
+#ifndef HEX_NUM_PMU_COUNTERS
+#define HEX_NUM_PMU_COUNTERS 8
+#endif
+
+static inline void hex_get_pmu(uint32_t counters[]) {
+#if __HVX_ARCH__ >= 79
+    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
+    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
+    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
+    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
+    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
+    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
+    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
+    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
+#else
+    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
+    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
+    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
+    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
+    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
+    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
+    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
+    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
+#endif
+}
+
+struct htp_thread_trace {
+    uint32_t count;
+    uint32_t max_events;
+    struct htp_trace_desc * events;
+};
+
+static inline void htp_trace_event(struct htp_thread_trace * tr, uint16_t id, uint16_t info, uint32_t type) {
+    if (tr && tr->events && tr->count < tr->max_events) {
+        uint32_t idx = tr->count;
+        tr->events[idx].id = id;
+        tr->events[idx].info = info | (type == HTP_TRACE_EVT_STOP ? 0x8000 : 0);
+        tr->events[idx].cycles = (uint32_t) hex_get_cycles();
+        tr->count++;
+    }
+}
+
+static inline void htp_trace_event_start(struct htp_thread_trace * tr, uint16_t id, uint16_t info) {
+    htp_trace_event(tr, id, info, HTP_TRACE_EVT_START);
+}
+
+static inline void htp_trace_event_stop(struct htp_thread_trace * tr, uint16_t id, uint16_t info) {
+    htp_trace_event(tr, id, info, HTP_TRACE_EVT_STOP);
+}
+
+#endif /* HEX_PROFILE_H */
@@ -107,31 +107,4 @@ static inline void hex_pause() {
    asm volatile(" pause(#255)\n");
 }

-#ifndef HEX_NUM_PMU_COUNTERS
-#define HEX_NUM_PMU_COUNTERS 8
-#endif
-
-static inline void hex_get_pmu(uint32_t counters[]) {
-#if __HVX_ARCH__ >= 79
-    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
-    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
-    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
-    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
-    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
-    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
-    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
-    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
-#else
-    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
-    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
-    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
-    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
-    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
-    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
-    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
-    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
-    // qurt_pmu_get_pmucnt(counters);
-#endif
-}
-
 #endif /* HEX_UTILS_H */
@@ -18,7 +18,7 @@
 #include "ggml-common.h"
 #include "hex-dma.h"
 #include "hex-fastdiv.h"
-#include "hmx-profile.h"
+#include "hex-profile.h"
 #include "hmx-queue.h"
 #include "hmx-utils.h"
 #include "htp-ctx.h"
@@ -367,8 +367,11 @@ static void fa_k_interleave_thread(unsigned int n, unsigned int i, void * data)
        return;
    }

+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
    hmx_interleave_rows_to_tiles(factx->vtcm_k_tiles, factx->vtcm_k_fp16[args->buf_idx], total_rows, (int) factx->DK,
                             (int) args->src_stride, start, end);
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }

 static void fa_phase_k_interleave(struct hmx_fa_context * factx, int kv_rows, size_t src_stride, size_t buf_idx) {
@@ -408,8 +411,11 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)

    __fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];

+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
    hmx_interleave_cols_to_tiles(v_tiles_dest, factx->vtcm_v_fp16[args->buf_idx], total_rows, (int) factx->DV,
                             (int) args->src_stride, (int) args->n_col_tiles, start, end);
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }

 static void fa_phase_v_interleave(struct hmx_fa_context * factx,
@@ -462,6 +468,9 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) {
        return;
    }

+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
+
    const struct htp_tensor * q       = args->q;
    const uint32_t            q_start = args->q_start;
    const uint32_t            kv_head = args->kv_head;
@@ -515,6 +524,7 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) {
            }
        }
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }

 static void fa_phase_q_load(struct hmx_fa_context *   factx,
@@ -566,6 +576,9 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) {
        return;
    }

+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
+
    const struct htp_tensor * dst        = args->dst;
    const __fp16 *            o_tile_src = args->o_tile_src;
    const uint32_t            q_start    = args->q_start;
@@ -611,6 +624,7 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) {
            }
        }
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }

 static void fa_phase_o_store(struct hmx_fa_context *   factx,
@@ -680,6 +694,9 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
        return;
    }

+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, vec_start);
+
    // Per-thread row scratch: thread i uses bufs at offset i * 2 * stride
    const size_t row_buf_stride = factx->row_buf_stride;
    HVX_Vector * my_row_buf0    = factx->vtcm_row_bufs + i * 2 * row_buf_stride;
@@ -950,6 +967,7 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
        factx->vtcm_s_rowmax[r_vec_idx] = rowmax_acc_v;
        factx->vtcm_p_rowsum[r_vec_idx] = rowsum_acc_v;
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, vec_start);
 }

 // Serial m/l update + build_D.  Must run after softmax barrier (s_rowmax written by all threads).
@@ -1245,6 +1263,7 @@ static __attribute__((noinline)) void fa_compute_slopes(
 // ============================================================================

 int hmx_flash_attn_ext(struct htp_ops_context * octx) {
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[HTP_MAX_NTHREADS] : NULL;
    const struct htp_tensor * q    = octx->src[0];
    const struct htp_tensor * k    = octx->src[1];
    const struct htp_tensor * v    = octx->src[2];
@@ -1422,19 +1441,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
        return HTP_STATUS_OK;
    }

-    // Profiling timers
-    TIMER_DEFINE(total);
-    TIMER_DEFINE(q_load);
-    TIMER_DEFINE(kv_dma);
-    TIMER_DEFINE(k_interleave);
-    TIMER_DEFINE(v_interleave);
-    TIMER_DEFINE(qk_dot);
-    TIMER_DEFINE(softmax);
-    TIMER_DEFINE(o_update);
-    TIMER_DEFINE(o_norm);
-    TIMER_DEFINE(o_store);
-
-    TIMER_START(total);

    // ======== DMA setup ========
    dma_queue * const dma = ctx->dma[0];
@@ -1474,12 +1480,10 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                const size_t   n_row_tiles = g_br_actual / HMX_FP16_TILE_N_ROWS;

                // ---- Load Q block [g_br, D] -> tiles, interleaving G heads ----
-                TIMER_START(q_load);
                if (n_rows_g < g_br) {
                    hvx_splat_u8_a(factx.vtcm_q_tiles, 0, q_tile_bytes);
                }
                fa_phase_q_load(&factx, q, q_start, kv_head, ib3, n_rows_g);
-                TIMER_STOP(q_load);

                // ---- Initialize per-block state ----
                hvx_splat_u8_a(factx.vtcm_l_vec,   0,      col_vec_bytes);
@@ -1558,10 +1562,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        const size_t   n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS);

                        // Wait for current KV DMA
-                        TIMER_START(kv_dma);
                        dma_queue_pop(dma);  // K
                        dma_queue_pop(dma);  // V
-                        TIMER_STOP(kv_dma);

                        // Push mask DMA for this block (single 2D DMA when broadcast)
                        bool has_mask_dma = false;
@@ -1583,10 +1585,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                            ou_job.DV               = DV;
                            hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job));
                        }
-
-                        TIMER_START(k_interleave);
                        fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx);
-                        TIMER_STOP(k_interleave);

                        // ---- Phase 2: qk_dot(blk) on HMX ‖ V_int(blk) + DMA prefetch on HVX ----
                        qk_job.q_tiles        = factx.vtcm_q_tiles;
@@ -1597,15 +1596,11 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        qk_job.n_dot_tiles    = DK / 32;
                        qk_job.n_tiles_per_bc = n_tiles_per_bc;
                        qk_job.hmx_scales     = factx.vtcm_hmx_scales_qk;
-                        TIMER_START(qk_dot);
                        hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_qk_dot_worker, &qk_job));

                        // DMA push next block (non-blocking, before worker_pool)
                        DMA_PREFETCH_KV(kv_blk + 1);
-
-                        TIMER_START(v_interleave);
                        fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc);
-                        TIMER_STOP(v_interleave);

                        // Pop and swap previous block's output update (deferred HMX pop)
                        if (kv_blk > 0) {
@@ -1615,7 +1610,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {

                        // Pop current block's dot product job
                        hmx_queue_pop(hmx_q);
-                        TIMER_STOP(qk_dot);

                        // ---- Phase 3: softmax(blk) + build_D(blk) | HMX idle ----
                        // Pop mask DMA before softmax (ensures VTCM buffer is ready)
@@ -1641,10 +1635,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        sargs.mask_vtcm            = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL;
                        sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride;
                        sargs.slopes               = factx.vtcm_slopes;
-
-                        TIMER_START(softmax);
                        fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br);
-                        TIMER_STOP(softmax);

                        buf_idx = 1 - buf_idx;
                    }  // end KV block loop (pipeline)
@@ -1664,11 +1655,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        ou_job.n_row_tiles_g_br = n_row_tiles_g_br;
                        ou_job.n_tiles_per_bc   = n_tiles_per_bc;
                        ou_job.DV               = DV;
-
-                        TIMER_START(o_update);
                        hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job));
                        hmx_queue_pop(hmx_q);
-                        TIMER_STOP(o_update);

                        hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev);
                    }
@@ -1683,23 +1671,14 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        const uint32_t kv_start    = kv_blk * Bc;
                        const uint32_t kv_rows     = hex_smin(Bc, nek1 - kv_start);
                        const size_t   n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS);
-
-                        TIMER_START(kv_dma);
                        dma_queue_pop(dma);  // K
                        dma_queue_pop(dma);  // V
-                        TIMER_STOP(kv_dma);

                        bool has_mask_dma = false;
                        MASK_DMA_PUSH(kv_start, kv_rows, has_mask_dma);
                        DMA_PREFETCH_KV(kv_blk + 1);
-
-                        // K interleave (multi-thread HVX)
-                        TIMER_START(k_interleave);
                        fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx);
-                        TIMER_STOP(k_interleave);

-                        // QK dot (inline HMX on main thread)
-                        TIMER_START(qk_dot);
                        {
                            const size_t n_dot_tiles       = (size_t) (DK / 32);
                            const __fp16 * restrict q_base = factx.vtcm_q_tiles;
@@ -1709,6 +1688,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                            __builtin_assume(n_col_tiles > 0);
                            __builtin_assume(n_dot_tiles > 0);

+                            htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                            Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_qk);
                            for (size_t r = 0; r < n_row_tiles; ++r) {
                                for (size_t c = 0; c < n_col_tiles; ++c) {
@@ -1724,8 +1704,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                                    Q6_mxmem_AR_after_hf(out_tile, 0);
                                }
                            }
+                            htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                        }
-                        TIMER_STOP(qk_dot);

                        // Pop mask DMA
                        MASK_DMA_POP(has_mask_dma);
@@ -1751,21 +1731,9 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        sargs.mask_vtcm            = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL;
                        sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride;
                        sargs.slopes               = factx.vtcm_slopes;
-
-                        TIMER_START(softmax);
                        fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br);
-                        TIMER_STOP(softmax);
-
-                        // V interleave (multi-thread HVX)
-                        TIMER_START(v_interleave);
-                        // FIX(v-stride): use n_tiles_per_bc (block-invariant) as V tile layout
-                        // stride to match o_update's v_tile access.  Using per-block n_col_tiles
-                        // misplaces DV_tile 1..3 in the last partial KV block.
                        fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc);
-                        TIMER_STOP(v_interleave);

-                        // O update (inline HMX on main thread)
-                        TIMER_START(o_update);
                        {
                            const size_t DV_tiles           = (size_t) (DV / 32);
                            const __fp16 * restrict d_base  = factx.vtcm_d_tiles;
@@ -1777,6 +1745,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                            __builtin_assume(n_col_tiles > 0);
                            __builtin_assume(DV_tiles > 0);

+                            htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                            Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id);
                            for (size_t r = 0; r < n_row_tiles; ++r) {
                                for (size_t c = 0; c < DV_tiles; ++c) {
@@ -1798,16 +1767,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                                    Q6_mxmem_AR_after_hf(o_tile_out, 0);
                                }
                            }
+                            htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                            hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev);
                        }
-                        TIMER_STOP(o_update);

                        buf_idx = 1 - buf_idx;
                    }  // end KV block loop (fallback)
                }

                // ---- Final normalization: O = diag(1/l) @ O ----
-                TIMER_START(o_norm);
                {
                    fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);

@@ -1830,6 +1798,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                        __builtin_assume(n_row_tiles > 0);
                        __builtin_assume(DV_tiles > 0);

+                        htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                        Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id);
                        for (size_t r = 0; r < n_row_tiles; ++r) {
                            for (size_t c = 0; c < DV_tiles; ++c) {
@@ -1842,14 +1811,12 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                                Q6_mxmem_AR_after_hf(o_out, 0);
                            }
                        }
+                        htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                    }
                }
-                TIMER_STOP(o_norm);

                // ---- Store O block ----
-                TIMER_START(o_store);
                fa_phase_o_store(&factx, dst, o_tile_curr, q_start, kv_head, ib3, n_rows_g);
-                TIMER_STOP(o_store);

 #undef MASK_DMA_PUSH
 #undef MASK_DMA_POP
@@ -1865,14 +1832,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
    }

-    TIMER_STOP(total);

-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "hmx-fa: %lld us, q_load=%lld kv_dma=%lld k_interleave=%lld v_interleave=%lld", TIMER_US(total),
-         TIMER_US(q_load), TIMER_US(kv_dma), TIMER_US(k_interleave), TIMER_US(v_interleave));
-    FARF(HIGH, "  qk_dot=%lld softmax=%lld o_update=%lld o_norm=%lld o_store=%lld", TIMER_US(qk_dot), TIMER_US(softmax),
-         TIMER_US(o_update), TIMER_US(o_norm), TIMER_US(o_store));
-#endif

    return HTP_STATUS_OK;
 }
@@ -27,7 +27,7 @@
 #include "hmx-ops.h"
 #include "hmx-utils.h"
 #include "hmx-queue.h"
-#include "hmx-profile.h"
+#include "hex-profile.h"

 #include "vtcm-utils.h"

@@ -430,6 +430,7 @@ typedef struct {
    int                      n_tasks;
    int                      n_k_tiles;
    struct fastdiv_values    n_k_tiles_div;
+    struct htp_thread_trace * traces;
 } x4x2_dequantize_state_t;

 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
@@ -533,11 +534,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(
                                                                                                               \
 static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) {                 \
    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;                                          \
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;                                   \
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);                                                 \
    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {                     \
        int start = task_id * state->n_tiles_per_task;                                                         \
        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);                             \
        dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end);                                 \
    }                                                                                                          \
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);                                                 \
 }

 DEFINE_DEQUANTIZE_Q4_TASK(q4_0,   q4_0_to_fp16_lut,   q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))
@@ -657,11 +661,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(

 static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) {
    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
        int start = task_id * state->n_tiles_per_task;
        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
        dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }

 static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
@@ -717,11 +724,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(

 static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) {
    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
        int start = task_id * state->n_tiles_per_task;
        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
        dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }

 static void convert_f16_weight_to_fp16_tiles_task(
@@ -773,11 +783,14 @@ static void convert_f16_weight_to_fp16_tiles_task(

 static void convert_f16_worker_loop(unsigned int n, unsigned int i, void *data) {
    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
        int start = task_id * state->n_tiles_per_task;
        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
        convert_f16_weight_to_fp16_tiles_task(state, start, end);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }

 static void quantize_f32_weight_to_fp16_tiles_task(
@@ -833,11 +846,14 @@ static void quantize_f32_weight_to_fp16_tiles_task(

 static void quantize_f32_worker_loop(unsigned int n, unsigned int i, void *data) {
    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
        int start = task_id * state->n_tiles_per_task;
        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
        quantize_f32_weight_to_fp16_tiles_task(state, start, end);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }


@@ -868,6 +884,7 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
    state.weight_type      = weight_type;
    state.n_k_tiles        = n_k_tiles;
    state.n_k_tiles_div    = n_k_tiles_div;
+    state.traces           = ctx ? ctx->trace : NULL;

    if (state.n_tasks == 1 || n_threads == 1) {
        dequant_worker_fn(1, 0, &state);
@@ -985,10 +1002,13 @@ typedef struct {
    int            n_chunks_per_task;
    int            n_cols;
    int            n;  // DDR row stride (total output columns)
+    struct htp_thread_trace * traces;
 } output_transfer_task_state_t;

 static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void *data) {
    output_transfer_task_state_t *st = (output_transfer_task_state_t *) data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, i);

    for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) {
        int    chunk_idx  = task_id * st->n_chunks_per_task;
@@ -998,6 +1018,7 @@ static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void
        const __fp16 *vtcm_src = st->vtcm_src + chunk_idx * st->n_cols;
        transfer_output_chunk_fp16_to_fp32(dst, vtcm_src, chunk_size, st->n_cols, st->n);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
 }

 static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, const __fp16 *vtcm_src,
@@ -1015,6 +1036,7 @@ static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst,
    state.vtcm_src          = vtcm_src;
    state.n_cols            = n_cols;
    state.n                 = n;
+    state.traces            = ctx ? ctx->trace : NULL;

    if (state.n_tasks == 1 || n_threads == 1) {
        transfer_output_chunk_worker_fn(1, 0, &state);
@@ -1086,10 +1108,13 @@ typedef struct {
    int          n_chunks_per_task;
    int          k_block;
    int          k_stride;
+    struct htp_thread_trace * traces;
 } activation_transfer_task_state_t;

 static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, void *data) {
    activation_transfer_task_state_t *st = (activation_transfer_task_state_t *) data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, i);

    for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) {
        // one chunk: one row
@@ -1100,6 +1125,7 @@ static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i,
        const float *src = st->src + chunk_idx * st->k_stride;
        transfer_activation_chunk_fp32_to_fp16(dst, src, chunk_size, st->k_block, st->k_stride);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
 }

 static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, const float *src, int n_rows, int k_block, int k_stride, int n_threads) {
@@ -1117,6 +1143,7 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *
    state.src               = src;
    state.k_block           = k_block;
    state.k_stride          = k_stride;
+    state.traces            = ctx ? ctx->trace : NULL;

    if (state.n_tasks == 1 || n_threads == 1) {
        transfer_activation_chunk_worker_fn(1, 0, &state);
@@ -1245,13 +1272,7 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float
    FARF(HIGH, "hmx-mm-2d: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu",
         m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget);

-    TIMER_DEFINE(activation_load);
-    TIMER_DEFINE(weight_load);
-    TIMER_DEFINE(hmx_core);
-    TIMER_DEFINE(output_store);

-    TIMER_DEFINE(total);
-    TIMER_START(total);

    int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);

@@ -1370,7 +1391,12 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float
                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads);

                // C: HMX Compute (Synchronous)
-                core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+                {
+                    struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL;
+                    htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+                    core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+                    htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+                }

                // D: Output Store
                float *output_chunk = dst + (mr * n + nc);
@@ -1380,18 +1406,7 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float
        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
    }

-    TIMER_STOP(total);

-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "hex-mm-2d: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n);
-    if (!use_pipeline) {
-        FARF(HIGH, "  activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
-             TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
-        size_t weight_size = (size_t)n * row_stride;
-        float  bandwidth   = 1e-3f * weight_size / (float)TIMER_US(weight_load);
-        FARF(HIGH, "  weight load bandwidth: %.2f GB/s", bandwidth);
-    }
-#endif

    return 0;
 }
@@ -1523,13 +1538,7 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
            m_chunk_n_rows, n_chunk_n_cols,
            (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

-    TIMER_DEFINE(activation_load);
-    TIMER_DEFINE(weight_load);
-    TIMER_DEFINE(hmx_core);
-    TIMER_DEFINE(output_store);
-    TIMER_DEFINE(total);

-    TIMER_START(total);

    const size_t fp16_row_bytes   = (size_t) params->k * sizeof(__fp16);
    const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16);
@@ -1549,7 +1558,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                // contiguous rows into a VTCM scratch buffer first, then HVX
                // converts from the contiguous VTCM buffer.  This avoids L2 cache
                // thrashing from HVX loads at large strides.
-                TIMER_START(activation_load);
                for (int g = 0; g < group_size; ++g) {
                    const float *activation_chunk = hmx_matmul_activation_batch_ptr(params, b2_base + g, b3) + mr * params->act_stride;
                    __fp16 *vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
@@ -1569,7 +1577,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                                                              params->k, params->act_stride, ctx->n_threads);
                    }
                }
-                TIMER_STOP(activation_load);

                void *buf_curr = vtcm_scratch0;
                void *buf_next = vtcm_scratch1;
@@ -1584,7 +1591,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                    const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols);
                    const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);

-                    TIMER_START(weight_load);
                    {
                        dma_queue_pop(ctx->dma[0]);

@@ -1601,24 +1607,22 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                                                 0, n_cols);
                        hex_swap_ptr(&buf_curr, &buf_next);
                    }
-                    TIMER_STOP(weight_load);

                    // Reuse the interleaved weight for every q_head in this GQA group
                    for (int g = 0; g < group_size; ++g) {
-                        TIMER_START(hmx_core);
                        {
                            const __fp16 * vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
+                            struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL;
+                            htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                            core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles,
                                                params->k / 32);
+                            htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                        }
-                        TIMER_STOP(hmx_core);

-                        TIMER_START(output_store);
                        {
                            float *output = hmx_matmul_dst_batch_ptr(params, b2_base + g, b3) + mr * params->dst_stride + nc;
                            transfer_output_chunk_threaded(ctx, output, vtcm_output, (int) n_rows, (int) n_cols, params->dst_stride, ctx->n_threads);
                        }
-                        TIMER_STOP(output_store);
                    }
                }
            }
@@ -1627,14 +1631,7 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32

    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);

-    TIMER_STOP(total);

-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d group=%d", __func__, TIMER_US(total),
-         params->m, params->k, params->n, group_size);
-    FARF(HIGH, "  activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
-         TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
-#endif

    return 0;
 }
@@ -1668,6 +1665,7 @@ typedef struct {
    size_t                          nb12;
    int                             start_row;
    int                             cne1;
+    struct htp_thread_trace        *traces;
 } activation_transfer_gathered_task_state_t;

 typedef struct {
@@ -1684,6 +1682,7 @@ typedef struct {
    size_t                          dst_nb2;
    int                             start_row;
    int                             cne1;
+    struct htp_thread_trace        *traces;
 } output_transfer_scattered_task_state_t;

 static void transfer_activation_chunk_fp32_to_fp16_gathered(
@@ -1780,6 +1779,9 @@ static void transfer_activation_chunk_fp32_to_fp16_gathered(

 static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigned int i, void *data) {
    activation_transfer_gathered_task_state_t *st = data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
+
    int chunk_idx = i;
    int chunk_size = st->n_chunks_per_task;
    int start_row = st->start_row + chunk_idx * chunk_size;
@@ -1791,6 +1793,7 @@ static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigne
            st->matrix_rows, st->cur_a, st->mapping_stride,
            st->ne11, &st->ne11_div, st->nb11, st->nb12, st->cne1);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
 }

 static void transfer_activation_chunk_gathered_threaded(
@@ -1830,6 +1833,7 @@ static void transfer_activation_chunk_gathered_threaded(
        .nb12              = nb12,
        .start_row         = start_row,
        .cne1              = cne1,
+        .traces            = ctx ? ctx->trace : NULL,
    };

    if (actual_threads <= 1) {
@@ -1895,6 +1899,9 @@ static void transfer_output_chunk_fp16_to_fp32_scattered(

 static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned int i, void *data) {
    output_transfer_scattered_task_state_t *st = data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
+
    int chunk_idx = i;
    int chunk_size = st->n_chunks_per_task;
    int start_row = st->start_row + chunk_idx * chunk_size;
@@ -1906,6 +1913,7 @@ static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned i
            st->matrix_rows, st->cur_a, st->mapping_stride,
            st->dst_nb1, st->dst_nb2, st->cne1);
    }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
 }

 static void transfer_output_chunk_scattered_threaded(
@@ -1942,6 +1950,7 @@ static void transfer_output_chunk_scattered_threaded(
        .dst_nb2           = dst_nb2,
        .start_row         = start_row,
        .cne1              = cne1,
+        .traces            = ctx ? ctx->trace : NULL,
    };

    if (actual_threads <= 1) {
@@ -2053,7 +2062,12 @@ int hmx_matmul_id_2d_f32(struct htp_context *ctx,

            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads);

-            core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+            {
+                struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL;
+                htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+                core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+                htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+            }

            transfer_output_chunk_scattered_threaded(
                ctx, dst, vtcm_output, (int) mr, (int) n_rows, (int) n_cols,
@@ -1,34 +0,0 @@
-// Conditional fine-grained profiling macros for HMX operations.
-//
-// Define ENABLE_PROFILE_TIMERS (via compiler flag or before including this
-// header) to instrument sub-operation latencies with HAP qtimer.  When the
-// macro is not defined the TIMER_* helpers expand to nothing so there is zero
-// overhead.
-//
-// Usage:
-//   TIMER_DEFINE(my_phase);          // declare accumulator variable
-//   TIMER_START(my_phase);           // snapshot start time
-//   ... work ...
-//   TIMER_STOP(my_phase);            // accumulate elapsed ticks
-//   FARF(ALWAYS, "my_phase: %lld us", TIMER_US(my_phase));
-
-#ifndef HMX_PROFILE_H
-#define HMX_PROFILE_H
-
-#include <HAP_perf.h>
-
-// #define ENABLE_PROFILE_TIMERS
-
-#if defined(ENABLE_PROFILE_TIMERS)
-#  define TIMER_DEFINE(name) int64_t name##_ticks = 0
-#  define TIMER_START(name)  int64_t name##_t0 = HAP_perf_get_qtimer_count()
-#  define TIMER_STOP(name)   name##_ticks += HAP_perf_get_qtimer_count() - name##_t0
-#  define TIMER_US(name)     HAP_perf_qtimer_count_to_us(name##_ticks)
-#else
-#  define TIMER_DEFINE(name)
-#  define TIMER_START(name)
-#  define TIMER_STOP(name)
-#  define TIMER_US(name)     0LL
-#endif
-
-#endif // HMX_PROFILE_H
@@ -44,7 +44,9 @@ static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) {
                case HMX_QUEUE_SUSPEND: hmx_unlock(q);  break;
                default:
                    hmx_lock(q);
+                    htp_trace_event_start(q->trace, HTP_TRACE_EVT_HMX_COMP, ir);
                    d->func(d->data);
+                    htp_trace_event_stop(q->trace, HTP_TRACE_EVT_HMX_COMP, ir);
                    break;
            }

@@ -11,6 +11,7 @@
 #include <HAP_farf.h>

 #include "hex-utils.h"
+#include "hex-profile.h"

 #ifdef __cplusplus
 extern "C" {
@@ -47,6 +48,7 @@ struct hmx_queue {
    void *           stack;
    uint32_t         hap_rctx;
    bool             hmx_locked;
+    struct htp_thread_trace * trace;
 };

 struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
@@ -4,6 +4,7 @@
 #include "hex-dma.h"
 #include "hmx-queue.h"
 #include "htp-ops.h"
+#include "hex-profile.h"
 #include "worker-pool.h"

 #include <assert.h>
@@ -70,6 +71,7 @@ struct htp_context {
    bool                   hmx_enabled;
    bool                   etm;
    uint32_t               profiler;
+    struct htp_thread_trace trace[HTP_MAX_NTHREADS + 1];

    uint8_t *              vtcm_base;
    size_t                 vtcm_size;
@@ -146,10 +146,36 @@ struct htp_op_desc {
    uint16_t dst;                       // Output tensor index
 };

+#ifndef HTP_MAX_NTHREADS
+#define HTP_MAX_NTHREADS 10
+#endif
+
+#define HTP_TRACE_MAX_EVENTS 256
+
 enum htp_profiler_mode {
    HTP_PROF_DISABLED = 0,
    HTP_PROF_BASIC    = 1,
    HTP_PROF_PMU      = 2,
+    HTP_PROF_TRACE    = 3,
+};
+
+enum htp_trace_event_id {
+    HTP_TRACE_EVT_DMA                 = 0,
+
+    HTP_TRACE_EVT_HVX_COMP            = 20,
+    HTP_TRACE_EVT_HVX_A_QUANT         = 21,
+    HTP_TRACE_EVT_HVX_A_PREP          = 22,
+    HTP_TRACE_EVT_HVX_W_DEQUANT       = 23,
+    HTP_TRACE_EVT_HVX_W_PREP          = 24,
+    HTP_TRACE_EVT_HVX_O_PROC          = 25,
+
+    HTP_TRACE_EVT_HMX_COMP            = 40,
+};
+
+struct htp_trace_desc {
+    uint32_t cycles;  // lower 32-bits of cycle counter
+    uint16_t id;      // Event ID
+    uint16_t info;    // bit 15: is_stop. bits 14-0: tile/chunk index or other metadata.
 };

 #define HTP_PROF_PMU_NCNT 8
@@ -158,8 +184,8 @@ enum htp_profiler_mode {
 struct htp_prof_desc {
    uint32_t opcode;                 // GGML/HTP Op
    uint32_t usecs;                  // Number of usec
-    uint32_t cycles;                 // Number of cycles
-    uint32_t pad;                    // Unused
+    uint32_t cycles_start;           // Start cycle counter
+    uint32_t cycles_stop;            // Stop cycle counter
    uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
 };

@@ -168,7 +194,7 @@ struct htp_opbatch_req {
    uint32_t n_bufs;      // Number of buffers
    uint32_t n_tensors;   // Number of tensors
    uint32_t n_ops;       // Number of ops
-    uint32_t flags;       // unused
+    uint32_t n_traces;    // Number of trace descriptors per thread
    uint32_t pad;         // unused
    // struct htp_buf_desc  bufs[];    -- dspqueue buf 0
    // struct htp_tensor    tensors[]; -- dspqueue buf 0
@@ -181,7 +207,8 @@ struct htp_opbatch_rsp {
    uint32_t n_bufs;     // Number of buffers
    uint32_t n_tensors;  // Number of tensors
    uint32_t n_ops;      // Number of op profile descriptors
-    uint32_t pad;        // unused
+    uint32_t n_traces[HTP_MAX_NTHREADS + 1];
+    uint8_t  pad[8];     // align to 8 bytes
    // struct htp_prof_desc profs[];  -- dspqueue buf 0
 };

@@ -400,7 +400,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
    ctx->hmx_queue   = NULL;
    if (use_hmx) {
        ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
-        if (!ctx->hmx_queue) {
+        if (ctx->hmx_queue) {
+            ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
+        } else {
            FARF(ERROR, "hmx-queue-create failed");
            ctx->hmx_enabled = false;
        }
@@ -425,6 +427,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
    ctx->n_threads = n_hvx;
    for (int i = 0; i < ctx->n_threads; i++) {
        ctx->dma[i] = dma_queue_create(256); // queue depth
+        if (ctx->dma[i]) {
+            ctx->dma[i]->trace = &ctx->trace[i];
+        }
    }

    ctx->ddr_spad_size = 512 * 1024; // 512 KB
@@ -502,7 +507,8 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {

 struct profile_data {
    uint64_t usecs;
-    uint64_t cycles;
+    uint64_t cycles_start;
+    uint64_t cycles_stop;
    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
 };

@@ -512,8 +518,9 @@ static inline void profile_start(uint32_t mode, struct profile_data * d) {
            hex_get_pmu(d->pmu_counters);
            // fallthrough
        case HTP_PROF_BASIC:
+        case HTP_PROF_TRACE:
            d->usecs  = HAP_perf_get_qtimer_count();
-            d->cycles = hex_get_cycles();
+            d->cycles_start = hex_get_cycles();
            break;
        default:
            break;
@@ -530,8 +537,9 @@ static inline void profile_stop(uint32_t mode, struct profile_data * d) {
            }
            // fallthrough
        case HTP_PROF_BASIC:
+        case HTP_PROF_TRACE:
            d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
-            d->cycles = hex_get_cycles() - d->cycles;
+            d->cycles_stop = hex_get_cycles();
            break;
        default:
            break;
@@ -845,14 +853,15 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
        const uint32_t t_size = sizeof(struct htp_tensor)    * n_tens;
        const uint32_t o_size = sizeof(struct htp_op_desc)   * n_ops;
        const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
+        const uint32_t tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(struct htp_trace_desc);

-        if (dbuf.size < b_size + t_size + o_size + p_size) {
-            FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
+        if (dbuf.size < b_size + t_size + o_size + p_size + tr_size) {
+            FARF(ERROR, "invalid opbatch memory block size %u (req %u)", dbuf.size, b_size + t_size + o_size + p_size + tr_size);
            break;
        }

-        FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
-                n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
+        FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u n-traces %u : m-size %u b-size %u t-size %u o-size %u", req.id,
+                n_bufs, n_tens, n_ops, req.n_traces, dbuf.size, b_size, t_size, o_size);

        // Setup descriptor pointers
        uint8_t * m_ptr = dbuf.ptr;
@@ -869,6 +878,20 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
        octx->n_threads = ctx->n_threads;
        octx->ctx       = ctx;

+        if (ctx->profiler == HTP_PROF_TRACE) {
+            memset(ctx->trace, 0, sizeof(ctx->trace));
+            struct htp_trace_desc * trace_events = (struct htp_trace_desc *) (m_ptr + p_size);
+            for (int t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                ctx->trace[t].events = &trace_events[t * req.n_traces];
+                ctx->trace[t].max_events = req.n_traces;
+            }
+        } else {
+            for (int t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                ctx->trace[t].events = NULL;
+                ctx->trace[t].max_events = 0;
+            }
+        }
+
        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;

@@ -886,7 +909,8 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            if (ctx->profiler) {
                pds[i].opcode = ops[i].opcode;
                pds[i].usecs  = prof.usecs;
-                pds[i].cycles = prof.cycles;
+                pds[i].cycles_start = prof.cycles_start;
+                pds[i].cycles_stop = prof.cycles_stop;
                for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
                    pds[i].pmu[j] = prof.pmu_counters[j];
                }
@@ -899,6 +923,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
        rsp.n_bufs    = n_bufs;
        rsp.n_tensors = n_tens;
        rsp.n_ops     = n_ops;
+        memset(rsp.pad, 0, sizeof(rsp.pad));
+        if (ctx->profiler == HTP_PROF_TRACE) {
+            for (int t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                rsp.n_traces[t] = ctx->trace[t].count;
+            }
+        } else {
+            memset(rsp.n_traces, 0, sizeof(rsp.n_traces));
+        }

        dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;

@@ -3350,6 +3350,7 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void *

 static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
    htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();
@@ -3411,10 +3412,12 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
                float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));

                const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
+                htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, iir0);
                for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
                    const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
                    mmctx->vec_dot_1x1(ne00, &dst_col[ir0], src0_row, src1_col);
                }
+                htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, iir0);
            }
        }
    }
@@ -3430,6 +3433,7 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
 // src1 tensor is already in VTCM spad
 static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
    htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows
    const uint32_t src1_nrows = ne11 * ne12 * ne13;  // src1 rows
@@ -3477,6 +3481,8 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
    for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;

+        htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
+
        // Process src1 columns in pairs (2×2 tiling)
        uint32_t ir1 = 0;
        for (; ir1 + 1 < src1_nrows; ir1 += 2) {
@@ -3494,6 +3500,8 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
            mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_stride, src1_col);
        }

+        htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
+
        // Prefetch next (n + spad_nrows) row
        const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
        const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
@@ -3511,12 +3519,14 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
                       src0_stride, src0_row_size, 1);
        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;

+        htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
        #pragma unroll(2)
        for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
            const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride);
            float * restrict dst_row          = (float *) (dst->data + (ir1 * dst_row_size));
            mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
        }
+        htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
    }

    t2 = HAP_perf_get_qtimer_count();
@@ -3530,6 +3540,7 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
 // q8x4x2 src1 tensor is already in VTCM spad
 static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
    htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const uint32_t src0_nrows = ne01;

@@ -3581,7 +3592,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
        // Process src0 rows
        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) {
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_4x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, ss0 + 2 * src0_stride, ss0 + 3 * src0_stride, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);

            // Prefetch next (n + spad_nrows) row
            const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3599,7 +3612,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                           src0_stride, src0_row_size, 2);
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            ir0 += 2;
        }
        if (ir0 < src0_end_row) {
@@ -3607,7 +3622,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                           src0_stride, src0_row_size, 1);
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            ir0 += 1;
        }
    } else {
@@ -3627,7 +3644,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
        // Process src0 rows
        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);

            // Prefetch next (n + spad_nrows) row
            const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3645,7 +3664,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                           src0_stride, src0_row_size, 1);
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
        }
    }

@@ -3669,6 +3690,7 @@ struct mmid_row_mapping {
 // src1 tensor is already in VTCM spad
 static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
    htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * restrict ids = octx->src[2];
    struct htp_spad * restrict   src2_spad = &octx->src2_spad;
@@ -3735,6 +3757,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;

+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            for (uint32_t cid = 0; cid < cne1; ++cid) {
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
                const int               rm1         = row_mapping.i1;  // expert idx
@@ -3746,6 +3769,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {

                mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
            }
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);

            // Prefetch next (n + spad_nrows) row
            const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3764,6 +3788,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
                           src0_row_size_padded, src0_row_size, 1);
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;

+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            for (uint32_t cid = 0; cid < cne1; ++cid) {
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
                const int               rm1         = row_mapping.i1;  // expert idx
@@ -3775,6 +3800,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {

                mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
            }
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
        }
    }

@@ -3789,6 +3815,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
 // src1 tensor is already in VTCM spad
 static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
    htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * restrict ids = octx->src[2];
    struct htp_spad * restrict   src2_spad = &octx->src2_spad;
@@ -3847,7 +3874,9 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
        // Process src0 rows
        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);

            // Prefetch next (n + spad_nrows) row
            const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3865,7 +3894,9 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                           src0_row_size_padded, src0_row_size, 1);
            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
            mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
        }
    }

@@ -4147,6 +4178,7 @@ static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, ui
 static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) {
    struct htp_matmul_context * mmctx = data;
    struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * src = octx->src[1];
    uint8_t * restrict dst = octx->src1_spad.data;
@@ -4163,6 +4195,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data)
    const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows

    const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
    const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row

    const size_t src_row_size = src->nb[1];
@@ -4189,6 +4222,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data)

    FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
         ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }

 static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) {
@@ -4219,6 +4253,7 @@ static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y,
 static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * data) {
    struct htp_matmul_context * mmctx = data;
    struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * src = octx->src[1];
    uint8_t * restrict dst = octx->src1_spad.data;
@@ -4235,6 +4270,7 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat
    const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows

    const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
    const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row

    const size_t src_row_size = src->nb[1];
@@ -4260,11 +4296,13 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat

    FARF(HIGH, "quantize-f32-q8_1x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
         ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }

 static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {
    struct htp_matmul_context * mmctx = data;
    struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * src = octx->src[1];
    uint8_t * restrict dst = octx->src1_spad.data;
@@ -4281,6 +4319,7 @@ static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {
    const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows

    const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
    const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row

    const size_t src_row_size = ne0 * sizeof(float);
@@ -4301,11 +4340,13 @@ static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {

    FARF(HIGH, "quantize-f32-f32: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
        ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }

 static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
    struct htp_matmul_context * mmctx = data;
    struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * src = octx->src[1];
    uint8_t * restrict dst = octx->src1_spad.data;
@@ -4322,6 +4363,7 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
    const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows

    const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
    const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row

    const size_t src_row_size = ne0 * sizeof(float);
@@ -4342,12 +4384,14 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {

    FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
        ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }

 // TODO just a plain copy that should be done via the DMA during the Op setup
 static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
    struct htp_matmul_context * mmctx = data;
    struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;

    const struct htp_tensor * src = octx->src[1];
    uint8_t * restrict dst = octx->src1_spad.data;
@@ -4364,6 +4408,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
    const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows

    const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
    const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row

    const size_t src_row_size = ne0 * sizeof(float);
@@ -4384,6 +4429,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {

    FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
        ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }


@@ -66,7 +66,6 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base(ggml
    const char * op_str = "undefined";
    switch (op) {
        case GGML_OP_ADD_ID: op_str = "add_id"; break;
-        case GGML_OP_CONCAT: op_str = "concat"; break;
        default: GGML_ABORT("fatal error");
    };

@@ -211,6 +210,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_meta
    return res;
 }

+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat(ggml_metal_library_t lib, ggml_type tsrc) {
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_concat_%s", ggml_type_name(tsrc));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal_library_t lib, const ggml_tensor * op) {
    char base[256];
    char name[256];
@@ -1689,7 +1703,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm(ggml_metal_
 }

 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_library_t lib, const ggml_tensor * op) {
-    assert(op->op == GGML_OP_ROPE);
+    assert(op->op == GGML_OP_ROPE || op->op == GGML_OP_ROPE_BACK);
+
+    const bool is_back = op->op == GGML_OP_ROPE_BACK;

    char base[256];
    char name[256];
@@ -1713,13 +1729,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
        snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
    }

-    snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
+    snprintf(name, 256, "%s_imrope=%d_is_back=%d", base, is_imrope ? 1 : 0, is_back ? 1 : 0);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
        ggml_metal_cv_t cv = ggml_metal_cv_init();

        ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
+        ggml_metal_cv_set_bool(cv, is_back,   FC_ROPE + 1);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

@@ -115,6 +115,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows          (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat            (ggml_metal_library_t lib, enum ggml_type tsrc);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat            (ggml_metal_library_t lib, enum ggml_type tsrc);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary             (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum               (ggml_metal_library_t lib, const struct ggml_tensor * op);
@@ -1123,13 +1123,24 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
            return true;
        case GGML_OP_CONCAT:
            {
-                // kernel_concat copies one float-sized value per element.
-                // Other scalar types need a type-generic copy kernel first.
                const enum ggml_type src0_type = op->src[0]->type;
                const enum ggml_type src1_type = op->src[1]->type;
-                return src0_type == src1_type &&
-                       src0_type == op->type &&
-                       (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
+                if (src0_type != src1_type || src0_type != op->type) {
+                    return false;
+                }
+                switch (src0_type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_I8:
+                    case GGML_TYPE_I16:
+                    case GGML_TYPE_I32:
+                    case GGML_TYPE_I64:
+                        return true;
+                    case GGML_TYPE_BF16:
+                        return has_bfloat;
+                    default:
+                        return false;
+                }
            }
        case GGML_OP_ADD:
        case GGML_OP_SUB:
@@ -1173,6 +1184,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_RMS_NORM:
            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
            return true;
        case GGML_OP_IM2COL:
            return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
@@ -375,6 +375,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
                n_fuse = ggml_metal_op_norm(ctx, idx);
            } break;
        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
            {
                n_fuse = ggml_metal_op_rope(ctx, idx);
            } break;
@@ -556,7 +557,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
        /*.dim  =*/ dim,
    };

-    auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
+    auto pipeline = ggml_metal_library_get_pipeline_concat(lib, op->type);

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -1418,6 +1418,9 @@ typedef decltype(kernel_repeat<float>) kernel_repeat_t;

 template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
 template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
+#if defined(GGML_METAL_HAS_BF16)
+template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat<bfloat>;
+#endif
 template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
 template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;

@@ -4355,6 +4358,7 @@ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_
 #endif

 constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];
+constant bool FC_rope_is_back   [[function_constant(FC_ROPE + 1)]];

 static float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
@@ -4378,6 +4382,9 @@ static void rope_yarn(
    }
    *cos_theta = cos(theta) * mscale;
    *sin_theta = sin(theta) * mscale;
+    if (FC_rope_is_back) {
+        *sin_theta *= -1.0f;
+    }
 }

 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
@@ -7510,14 +7517,15 @@ template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<
 template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;

+template<typename T>
 kernel void kernel_concat(
-    constant ggml_metal_kargs_concat & args,
-    device  const char * src0,
-    device  const char * src1,
-    device        char * dst,
-    uint3   tgpig[[threadgroup_position_in_grid]],
-    ushort3 tpitg[[thread_position_in_threadgroup]],
-    ushort3   ntg[[threads_per_threadgroup]]) {
+        constant ggml_metal_kargs_concat & args,
+        device  const char * src0,
+        device  const char * src1,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {

    const int i3 = tgpig.z;
    const int i2 = tgpig.y;
@@ -7530,21 +7538,33 @@ kernel void kernel_concat(
    int o[4] = {0, 0, 0, 0};
    o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));

-    device const float * x;
-
    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        device const T * x;
+
        if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
-            x = (device const float *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
+            x = (device const T *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
        } else {
-            x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
+            x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
        }

-        device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+        device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);

        *y = *x;
    }
 }

+typedef decltype(kernel_concat<float>) kernel_concat_t;
+
+template [[host_name("kernel_concat_f32")]]  kernel kernel_concat_t kernel_concat<float>;
+template [[host_name("kernel_concat_f16")]]  kernel kernel_concat_t kernel_concat<half>;
+#if defined(GGML_METAL_HAS_BF16)
+template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat<bfloat>;
+#endif
+template [[host_name("kernel_concat_i8")]]   kernel kernel_concat_t kernel_concat<char>;
+template [[host_name("kernel_concat_i16")]]  kernel kernel_concat_t kernel_concat<short>;
+template [[host_name("kernel_concat_i32")]]  kernel kernel_concat_t kernel_concat<int>;
+template [[host_name("kernel_concat_i64")]]  kernel kernel_concat_t kernel_concat<long>;
+
 template<int nr0, typename args_t>
 void kernel_mul_mv_q2_K_f32_impl(
        args_t args,
@@ -564,6 +564,9 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mat_f16_f32_1row;
    cl_kernel kernel_mul_mat_f16_f32;
    cl_kernel kernel_mul_mat_f16_f32_l4;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq;
    cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_adreno_xmem_pack_src_f32;
    cl_kernel kernel_adreno_xmem_prepack_weight_f16;
@@ -1787,6 +1790,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4   = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err));
+        if (backend_ctx->gpu_family == ADRENO) {
+            CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err));
+            CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err));
+        }
        GGML_LOG_CONT(".");
    }

@@ -14570,11 +14578,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            }

            if (src1t == GGML_TYPE_F32) {
+                // heuristic for packing more work for Adreno
+                const bool adreno_use_lane_split =
+                    backend_ctx->gpu_family == ADRENO &&
+                    ne11 == 1 &&
+                    ne01 >= 8 &&
+                    ne00 % 4 == 0 &&
+                    r3 == 1 && r2 >= 1 && r2 <= 8 &&
+                    (ne12 % r2) == 0;
+
                if (ne11 * ne12 < 4) {
                    kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
+                } else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq;
+                    nrows  = 1;
+                } else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls;
+                    nrows  = 1;
                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
-                    nrows = ne11;
+                    if (ne11 == 1) {
+                        kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr;
+                        nrows  = 1; // not used by this kernel
+                    } else {
+                        kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
+                        nrows  = ne11;
+                    }
                } else {
                    kernel = backend_ctx->kernel_mul_mat_f16_f32;
                    nrows = 4;
@@ -15353,12 +15381,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    } else {
-        int64_t ny = (ne11 + nrows - 1)/nrows;
+        if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) {
+            const int NDST_DR = 4;
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13};
+            size_t local_work_size[]  = {(size_t)nth0, (size_t)nth1, 1};

-        size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
-        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) {
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03};
+            size_t local_work_size[]  = {(size_t)nth0, (size_t)nth1, 1};

-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) {
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03};
+            size_t local_work_size[]  = {(size_t)nth0, (size_t)nth1, 1};
+
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else {
+            int64_t ny = (ne11 + nrows - 1)/nrows;
+
+            size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        }
    }
 }

@@ -82,3 +82,299 @@ kernel void kernel_mul_mat_f16_f32_l4(
        }
    }
 }
+
+// Each subgroup produces DR_NDST outputs, assumes ne11 == 1
+#define MUL_MAT_F16_F32_L4_DR_NDST 4
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_f16_f32_l4_dr(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst  = (global float*)((global char*)dst  + offsetd);
+
+    const int r0_base = get_group_id(0) * MUL_MAT_F16_F32_L4_DR_NDST;
+    const int im      = get_group_id(2);
+
+    const int i12 = im % ne12;
+    const int i13 = im / ne12;
+
+    // assume ne11 == 1
+    const ulong offset_src1 = i12*nb12 + i13*nb13;
+    global float4 * y4 = (global float4 *)(src1 + offset_src1);
+
+    global half4 * x4[MUL_MAT_F16_F32_L4_DR_NDST];
+    float          sumf[MUL_MAT_F16_F32_L4_DR_NDST];
+
+    const ulong   k_head_off = (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    #pragma unroll
+    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
+        int       r0   = r0_base + n;
+        int       r0c  = r0 < ne01 ? r0 : 0;
+        ulong     off  = (ulong)r0c*nb01 + k_head_off;
+        x4[n]   = (global half4 *)(src0 + off);
+        sumf[n] = 0.0f;
+    }
+
+    const int n_chunks = ne00 / 4;
+    const int sg_size  = get_max_sub_group_size();
+    const int lid      = get_sub_group_local_id();
+
+    for (int i = lid; i < n_chunks; i += sg_size) {
+        float4 q = y4[i];
+        #pragma unroll
+        for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
+            float4 k = convert_float4(x4[n][i]);
+            sumf[n] = mad(k.s0, q.s0, sumf[n]);
+            sumf[n] = mad(k.s1, q.s1, sumf[n]);
+            sumf[n] = mad(k.s2, q.s2, sumf[n]);
+            sumf[n] = mad(k.s3, q.s3, sumf[n]);
+        }
+    }
+
+    #pragma unroll
+    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
+        float reduced = sub_group_reduce_add(sumf[n]);
+        int   r0      = r0_base + n;
+        if (lid == 0 && r0 < ne01) {
+            dst[im*ne1*ne0 + r0] = reduced;
+        }
+    }
+}
+
+// Kernels for decoding, Adreno only for now
+#define MUL_MAT_F16_F32_L4_DR_LS_R2_MAX 8
+
+#ifdef ADRENO_GPU
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+#define sub_group_shuffle_xor(val, mask) qcom_sub_group_shuffle_xor((val), (mask), CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.0f)
+
+REQD_SUBGROUP_SIZE_64
+kernel void kernel_mul_mat_f16_f32_l4_dr_ls(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst  = (global float*)((global char*)dst  + offsetd);
+
+    const int r0_base = get_group_id(0) * 2;
+    const int kv_grp  = get_group_id(2);   // KV head group; im = kv_grp*r2 + q
+
+    const int i12_kv = kv_grp % ne02;
+    const int i13_kv = kv_grp / ne02;
+
+    const int lid     = get_sub_group_local_id();
+    const int subhalf = lid >> 5;          // 0 or 1 (which K row in the WG)
+    const int intra   = lid & 31;          // 0..31 (lane within the half)
+
+    const int r0  = r0_base + subhalf;
+    const int r0c = r0 < ne01 ? r0 : 0;    // clamp OOB to row 0; skip write below
+
+    // K row pointer for this lane (one K row per half-wave).
+    const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
+    global half4 * x4 = (global half4 *)(src0 + k_off);
+
+    global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        const int i12_q = i12_kv*r2 + q;
+        const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
+        y4[q] = (global float4 *)(src1 + q_off);
+    }
+
+    float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        partial[q] = 0.0f;
+    }
+
+    const int n_chunks = ne00 / 4;
+
+    for (int i = intra; i < n_chunks; i += 32) {
+        float4 k = convert_float4(x4[i]);
+
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                float4 v = y4[q][i];
+                partial[q] = mad(k.s0, v.s0, partial[q]);
+                partial[q] = mad(k.s1, v.s1, partial[q]);
+                partial[q] = mad(k.s2, v.s2, partial[q]);
+                partial[q] = mad(k.s3, v.s3, partial[q]);
+            }
+        }
+    }
+
+    // half-wave reduction
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        if (q < r2) {
+            partial[q] += sub_group_shuffle_xor(partial[q],  1u);
+            partial[q] += sub_group_shuffle_xor(partial[q],  2u);
+            partial[q] += sub_group_shuffle_xor(partial[q],  4u);
+            partial[q] += sub_group_shuffle_xor(partial[q],  8u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 16u);
+        }
+    }
+
+    if (intra == 0 && r0 < ne01) {
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                const int im = i12_kv*r2 + q + i13_kv*ne12;
+                dst[im*ne1*ne0 + r0] = partial[q];
+            }
+        }
+    }
+}
+
+REQD_SUBGROUP_SIZE_64
+kernel void kernel_mul_mat_f16_f32_l4_dr_lq(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst  = (global float*)((global char*)dst  + offsetd);
+
+    const int r0_base = get_group_id(0) * 4;
+    const int kv_grp  = get_group_id(2);
+
+    const int i12_kv = kv_grp % ne02;
+    const int i13_kv = kv_grp / ne02;
+
+    const int lid   = get_sub_group_local_id();
+    const int subq  = lid >> 4;            // 0..3 (which K row)
+    const int intra = lid & 15;            // 0..15 (lane within quarter)
+
+    const int r0  = r0_base + subq;
+    const int r0c = r0 < ne01 ? r0 : 0;
+
+    const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
+    global half4 * x4 = (global half4 *)(src0 + k_off);
+
+    global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        const int i12_q = i12_kv*r2 + q;
+        const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
+        y4[q] = (global float4 *)(src1 + q_off);
+    }
+
+    float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        partial[q] = 0.0f;
+    }
+
+    const int n_chunks = ne00 / 4;
+
+    for (int i = intra; i < n_chunks; i += 16) {
+        float4 k = convert_float4(x4[i]);
+
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                float4 v = y4[q][i];
+                partial[q] = mad(k.s0, v.s0, partial[q]);
+                partial[q] = mad(k.s1, v.s1, partial[q]);
+                partial[q] = mad(k.s2, v.s2, partial[q]);
+                partial[q] = mad(k.s3, v.s3, partial[q]);
+            }
+        }
+    }
+
+    // quarter-wave reduction
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        if (q < r2) {
+            partial[q] += sub_group_shuffle_xor(partial[q], 1u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 2u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 4u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 8u);
+        }
+    }
+
+    if (intra == 0 && r0 < ne01) {
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                const int im = i12_kv*r2 + q + i13_kv*ne12;
+                dst[im*ne1*ne0 + r0] = partial[q];
+            }
+        }
+    }
+}
+#endif // ADRENO_GPU
@@ -2,12 +2,7 @@
 # Override root .clang-format
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
-Cpp11BracedListStyle: true
-SpacesInContainerLiterals: false
-BreakBeforeBraces: Attach
 AccessModifierOffset: -4
-IndentCaseBlocks: false
-IndentCaseLabels: false

 Language:        Cpp
 AlignAfterOpenBracket: Align
@@ -1,8 +1,6 @@
-find_package(OpenVINO REQUIRED)
+find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
 find_package(OpenCL REQUIRED)

-include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
-
 file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
 file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")

@@ -11,7 +9,7 @@ ggml_add_backend_library(ggml-openvino
    ${GGML_HEADERS_OPENVINO}
 )

-target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
+target_link_libraries(ggml-openvino PRIVATE openvino::runtime openvino::threading OpenCL::OpenCL)

 if (GGML_OPENVINO)
    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
@@ -1,6 +1,7 @@
 #pragma once

-#include "ggml-quants.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include "openvino/decoder.h"

@@ -14,21 +15,21 @@

 struct ModelParams {
    int ctx = -1;
-    int ctx_swa = -1;
    int ctx_per_seq = -1;
    int ctx_per_seq_swa = -1;
    int n_seq = 1;
-    int n_heads = -1;
    int n_heads_kv = -1;
    int head_size = -1;
    int32_t rope_params[15];
+    bool mixed_rope_params = false;
    std::vector<int> swa_layers;

    std::vector<std::string> kv_names;
    size_t kv_buffer_ctx_id = 0;

    bool same_rope_params(const ModelParams & other) const {
-        return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
+        return mixed_rope_params == other.mixed_rope_params &&
+               memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
    }

    bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
@@ -56,12 +57,14 @@ public:
        std::string node_name;
        std::string node_op_type;
        std::map<std::string, ggml_tensor *> node_inputs;
+        std::map<std::string, std::vector<std::pair<std::string, ggml_tensor *>>> node_inputs_views;
        std::vector<std::string> node_inputs_names;
        ggml_tensor * node_output;
        std::string node_output_name;
        int node_op_case = 0;
        void * data_addr;
    };
+
    // Graph decoder
    GgmlOvDecoder(ggml_cgraph * cgraph,
                  ModelParams & model_params,
@@ -69,6 +72,7 @@ public:
                  std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                  bool is_static,
                  bool is_stateful = false,
+                  bool model_is_splitted = false,
                  bool is_prefill = false,
                  int prefill_chunk_size = 256);

@@ -84,6 +88,42 @@ public:

    virtual std::vector<size_t> get_input_stride(int node_idx, const std::string & name) const override;

+    virtual size_t get_view_input_size(int node_idx, const std::string & name) const override;
+
+    virtual size_t get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const override;
+
+    virtual size_t get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const override;
+
+    virtual std::vector<size_t> get_view_input_stride(int node_idx,
+                                                      const std::string & name,
+                                                      size_t view_index) const override;
+
+    virtual std::vector<size_t> get_view_input_src_stride(int node_idx,
+                                                          const std::string & name,
+                                                          size_t view_index) const override;
+
+    virtual ov::Shape get_view_input_ggml_shape(int node_idx,
+                                                const std::string & name,
+                                                size_t view_index) const override;
+
+    virtual ov::Shape get_view_input_src_ggml_shape(int node_idx,
+                                                    const std::string & name,
+                                                    size_t view_index) const override;
+
+    virtual ov::PartialShape get_view_input_ov_shape(int node_idx,
+                                                     const std::string & name,
+                                                     size_t view_index) const override;
+
+    virtual ov::PartialShape get_view_input_src_ov_shape(int node_idx,
+                                                         const std::string & name,
+                                                         size_t view_index) const override;
+
+    virtual std::string get_view_input_name(int node_idx, const std::string & name, size_t view_index) const override;
+
+    virtual std::string get_view_input_src_name(int node_idx,
+                                                const std::string & name,
+                                                size_t view_index) const override;
+
    virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override;

    virtual size_t get_input_size() const override;
@@ -106,10 +146,14 @@ public:

    virtual ov::element::Type get_output_type(int node_idx) const override;

+    virtual std::vector<size_t> get_output_stride(int node_idx) const override;
+
    virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;

    virtual int32_t * get_output_op_params(int node_idx) const override;

+    virtual size_t get_output_op_offset(int node_idx) const override;
+
    virtual std::vector<std::string> get_output_names(int node_idx) const override;

    virtual const std::string & get_op_type() const override;
@@ -120,7 +164,10 @@ public:

    virtual const std::string & get_op_name(int node_idx) const override;

-    virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;
+    virtual int32_t get_op_dynamic_dim(int node_idx) const override;
+
+    virtual void visit_subgraph(
+        std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;

    ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }

@@ -142,16 +189,12 @@ public:
        return m_model_weights;
    }

-    virtual std::vector<std::string> get_model_output_names() const override {
-        return m_model_output_names;
-    }
+    virtual std::vector<std::string> get_model_output_names() const override { return m_model_output_names; }

    const std::map<std::string, ggml_tensor *> & get_model_outputs() const { return m_model_outputs; }

    virtual int get_ctx_size() const { return m_model_params.ctx; }

-    virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; }
-
    virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; }

    virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; }
@@ -169,13 +212,21 @@ public:

    virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }

+    virtual bool has_mixed_rope_params() const override { return m_model_params.mixed_rope_params; }
+
    virtual std::map<std::string, std::string> get_kv_param_res_names() const override;

    virtual bool is_static() const override { return m_is_static; }

    virtual bool is_stateful() const override { return m_is_stateful; }

-    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
+    int get_static_n_tokens() const { return m_is_prefill ? m_prefill_chunk_size : 1; }
+
+    virtual bool is_splited_model() const override { return m_model_is_splitted; }
+
+    ov::PartialShape get_graph_input_shape(const ggml_tensor * op,
+                                           const ggml_tensor * input,
+                                           int dynamic_dim_index = -1) const;

    static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

@@ -205,6 +256,7 @@ public:
    bool m_is_prefill = false;
    bool m_naive = false;
    int m_prefill_chunk_size = 0;
+    bool m_model_is_splitted = false;  // label the cgraph is splited or not

    static ov::Shape get_shape(const ggml_tensor * tensor);
    static std::vector<size_t> get_stride(const ggml_tensor * tensor);
@@ -227,7 +279,8 @@ public:
    }

    inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
-        return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
+        return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) ||
+               (op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]);
    }

    inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -235,7 +288,8 @@ public:
    }

    inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
-        return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
+        return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY ||
+               (op != nullptr && op->op == GGML_OP_SET_ROWS && op->src[2] == tensor);
    }

    inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -243,23 +297,18 @@ public:
    }

    inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
-        return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
+        return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE &&
+               op->src[1]->op == GGML_OP_NONE;
    }

-    static std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) {
-        if (is_inp_tok(tensor, op)) {
-            return "inp_tokens";
-        }
+    std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) {
        if (is_inp_pos(tensor, op)) {
            return "inp_pos";
        }
        if (is_inp_emb(tensor, op)) {
            return "embd";
        }
-        if (is_output_idx(tensor, op)) {
-            return "inp_out_ids";
-        }
-        if (is_inp_mask(tensor, op)) {
+        if (is_stateful() && is_inp_mask(tensor, op)) {
            return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
        }
        return tensor->name;
@@ -272,6 +321,9 @@ private:
    void compute_model_inputs();
    void compute_model_outputs();

+    // Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
+    void compute_node_dynamic_dims();
+
    void validate_cgraph() const;

    ggml_cgraph * m_cgraph = nullptr;
@@ -284,6 +336,7 @@ private:
    std::map<std::string, ggml_tensor *> m_model_outputs;
    std::vector<std::string> m_model_output_names;
    std::vector<NodeInfo> m_node_info_list;
+    std::map<ggml_tensor *, int> m_node_dynamic_dims;

    ModelParams m_model_params;
    ComputeParams m_compute_params;
@@ -291,4 +344,4 @@ private:

 void print_tensor_address_map(const ggml_cgraph * cgraph);

-int extract_layer_from_name(const std::string & name);
+std::optional<int> extract_layer_from_name(const std::string & name);
@@ -3,6 +3,7 @@
 #include "ggml-impl.h"
 #include "ggml.h"

+#include <cstdlib>
 #include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
@@ -22,7 +23,38 @@ void ggml_openvino_device_config::init() {
    if (initialized) {
        return;
    }
-    device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
+
+    // All recognized GGML_OPENVINO_* env vars. Their values are cached here
+    // once at backend init time and read back via ggml_openvino_getenv_str()
+    // (raw string) or ggml_openvino_getenv_int() (integer / boolean toggle).
+    static constexpr const char * env_var_names[] = {
+        // String values (use ggml_openvino_getenv_str)
+        "GGML_OPENVINO_DEVICE",
+        "GGML_OPENVINO_CACHE_DIR",
+        // Integer values (use ggml_openvino_getenv_int)
+        "GGML_OPENVINO_PREFILL_CHUNK_SIZE",
+        // Boolean toggles (treated as int flags via ggml_openvino_getenv_int)
+        "GGML_OPENVINO_STATEFUL_EXECUTION",
+        "GGML_OPENVINO_PROFILING",
+        "GGML_OPENVINO_DUMP_CGRAPH",
+        "GGML_OPENVINO_DUMP_IR",
+        "GGML_OPENVINO_DEBUG_INPUT",
+        "GGML_OPENVINO_DEBUG_OUTPUT",
+        "GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS",
+        "GGML_OPENVINO_ENABLE_CACHE",
+        "GGML_OPENVINO_DISABLE_CACHE",
+        "GGML_OPENVINO_DISABLE_KV_SLICE",
+        "GGML_OPENVINO_MANUAL_GQA_ATTN",
+    };
+
+    for (const char * const & env_var : env_var_names) {
+        auto * env = getenv(env_var);
+        if (env) {
+            environment_variables[env_var] = env;
+        }
+    }
+
+    device_name = ggml_openvino_getenv_str("GGML_OPENVINO_DEVICE", "CPU");
    auto available_devices = ov_singleton_core().get_available_devices();
    if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
        GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
@@ -30,7 +62,7 @@ void ggml_openvino_device_config::init() {
    }
    is_npu = (device_name == "NPU");

-    auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
+    const char * cache_dir = ggml_openvino_getenv_str("GGML_OPENVINO_CACHE_DIR");
    if (device_name == "NPU") {
        compile_config = {
            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"   },
@@ -119,6 +151,23 @@ const std::string & ggml_openvino_get_device_name() {
    return ggml_openvino_get_device_config().device_name;
 }

+// Get the value of a GGML_OPENVINO_* env var as a string. Returns
+// default_value when the var is unset or set to an empty string.
+const char * ggml_openvino_getenv_str(const char * var, const char * default_value) {
+    auto & env_map = ggml_openvino_get_device_config().environment_variables;
+    auto it = env_map.find(var);
+    return (it == env_map.end() || it->second.empty()) ? default_value : it->second.c_str();
+}
+
+// Get the value of a GGML_OPENVINO_* env var as an int (via std::atoi).
+// Returns default_value (0) when the var is unset or empty. Used for both
+// integer settings (e.g. GGML_OPENVINO_PREFILL_CHUNK_SIZE) and boolean
+// toggles: "0" disables, any non-zero integer enables.
+int ggml_openvino_getenv_int(const char * var, int default_value) {
+    const char * v = ggml_openvino_getenv_str(var, nullptr);
+    return v ? std::atoi(v) : default_value;
+}
+
 // Check if running on NPU
 bool ggml_openvino_is_npu() {
    return ggml_openvino_get_device_config().is_npu;
@@ -173,7 +222,8 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
        return std::nullopt;
    }
    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
-        return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
+        return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 :
+                                                                             ExtraQuantType::Q8_0_C);
    }
    if (strncmp(tensor->name, "output.weight", 13) == 0) {
        return ExtraQuantType::Q8_0_C;
@@ -298,6 +348,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
        layout.is_symmetric = true;
        break;

+    case GGML_TYPE_Q5_1:
+        // u8 weights (5-bit values), asymmetric (scale + zero point)
+        break;
+
    case GGML_TYPE_Q6_K:
        layout.weights_per_block = 16;
        layout.is_symmetric = true;
@@ -64,6 +64,7 @@ struct ggml_openvino_device_config {
    bool initialized = false;
    std::optional<ov::RemoteContext> remote_context;
    ov::AnyMap compile_config;
+    std::unordered_map<std::string, std::string> environment_variables;
    cl_command_queue cl_queue = nullptr;

    void init();
@@ -79,6 +80,22 @@ void ggml_openvino_init_device_config();
 // Get the device name
 const std::string & ggml_openvino_get_device_name();

+// Environment variable accessors. All GGML_OPENVINO_* env vars are read once
+// during backend init and cached on the device config; consumers must go
+// through these helpers (never call ::getenv directly) so behavior stays
+// consistent and centralized.
+//
+// Use ggml_openvino_getenv_str() for string / path values
+// (e.g. GGML_OPENVINO_DEVICE, GGML_OPENVINO_CACHE_DIR). The optional
+// default_value is returned when the var is unset or empty.
+//
+// Use ggml_openvino_getenv_int() for boolean toggles and integer settings.
+// It returns std::atoi(value) when set, otherwise default_value. For
+// boolean use, `if (ggml_openvino_getenv_int(name))` is true iff the value
+// is a non-zero integer (so "0" disables, "1" enables).
+const char * ggml_openvino_getenv_str(const char * var, const char * default_value = nullptr);
+int ggml_openvino_getenv_int(const char * var, int default_value = 0);
+
 // Check if running on NPU
 bool ggml_openvino_is_npu();

@@ -115,9 +132,9 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {

 // Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
 struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
-    ov::Tensor weights;   // U4 or U8 extracted weights
-    ov::Tensor scales;    // F16 scales
-    ov::Tensor zp;        // U4 or U8 zero points (same type as weights)
+    ov::Tensor weights;                     // U4 or U8 extracted weights
+    ov::Tensor scales;                      // F16 scales
+    ov::Tensor zp;                          // U4 or U8 zero points (same type as weights)
    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight subgraph

    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
@@ -132,8 +149,9 @@ struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
 struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
    std::shared_ptr<ov::Tensor> tensor;  // For direct use with infer_request

-    explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t)
-        : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {}
+    explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t) :
+        ggml_openvino_extra_base(Type::TENSOR),
+        tensor(std::move(t)) {}
 };

 // =====================================================
@@ -152,11 +170,11 @@ struct ggml_openvino_extracted_layout {
    size_t zp_size = 0;         // Size of zero points in bytes (U4 or U8)
    bool is_u4;                 // true for U4 weights, false for U8
    int64_t weights_per_block;  // weights per scale/zp block
-    bool is_symmetric;        // true for symmetric quantization
+    bool is_symmetric;          // true for symmetric quantization

    // Requantization info
-    bool is_requant = false;                      // true if this tensor needs requantization
-    std::optional<ExtraQuantType> requant_type;   // target requant type if is_requant
+    bool is_requant = false;                     // true if this tensor needs requantization
+    std::optional<ExtraQuantType> requant_type;  // target requant type if is_requant
 };

 // Calculate the buffer layout for extracted quantized data
@@ -164,6 +182,9 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten

 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);

+// Check if a tensor's buffer uses remote (device) memory (e.g. GPU USM)
+bool ggml_openvino_buffer_is_remote(const ggml_tensor * tensor);
+
 // Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
 // This sets tensor->extra and tracks the extra in the buffer context for cleanup.
 void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
@@ -4,13 +4,14 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-openvino-extra.h"
+#include "ggml-openvino/openvino/op_table.h"
 #include "ggml-openvino/utils.h"
 #include "ggml-quants.h"
 #include "ggml.h"

 #include <atomic>
-#include <cstdlib>
 #include <cstdint>
+#include <cstdlib>
 #include <cstring>
 #include <memory>
 #include <mutex>
@@ -146,8 +147,7 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
 }

 static bool is_stateful_enabled() {
-    static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
-    return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
+    return ggml_openvino_getenv_int("GGML_OPENVINO_STATEFUL_EXECUTION") != 0;
 }

 static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
@@ -367,11 +367,9 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer
            ggml_backend_openvino_buffer_context * src_ctx =
                (ggml_backend_openvino_buffer_context *) src->buffer->context;
            if (src_ctx->is_remote) {
-                cl_int err =
-                    mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
+                cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
                if (err != CL_SUCCESS) {
-                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__,
-                                   err);
+                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__, err);
                    return false;
                }
                return true;
@@ -579,6 +577,17 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
    return ctx->id;
 }

+bool ggml_openvino_buffer_is_remote(const ggml_tensor * tensor) {
+    if (tensor == nullptr || tensor->buffer == nullptr) {
+        return false;
+    }
+    if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
+        return false;
+    }
+    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);
+    return ctx->is_remote;
+}
+
 void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(tensor->buffer != nullptr);
@@ -785,6 +794,18 @@ static bool has_view_op_input(const ggml_tensor * op) {
    return false;
 }

+static bool has_non_contiguous_view_input(const ggml_tensor * op) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] == nullptr) {
+            break;
+        }
+        if (op->src[i]->op == GGML_OP_VIEW && !ggml_is_contiguous(op->src[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
 static bool is_supported_flash_attn_pattern(const ggml_tensor * op) {
    // pattern of q,k,v should be q->op==PERMUTE, q->src[0]->op==VIEW, q->src[0]->src[0]->view_src==nullptr
    for (int i = 0; i < 3; i++) {
@@ -797,17 +818,107 @@ static bool is_supported_flash_attn_pattern(const ggml_tensor * op) {
    return true;
 }

+static bool is_gemma3n_flash_attn_pattern(const ggml_tensor * op) {
+    if (!is_supported_flash_attn_pattern(op)) {
+        return false;
+    }
+
+    const ggml_tensor * q_base =
+        op->src[0] != nullptr && op->src[0]->src[0] != nullptr ? op->src[0]->src[0]->src[0] : nullptr;
+    const ggml_tensor * k_base =
+        op->src[1] != nullptr && op->src[1]->src[0] != nullptr ? op->src[1]->src[0]->src[0] : nullptr;
+    const ggml_tensor * v_base =
+        op->src[2] != nullptr && op->src[2]->src[0] != nullptr ? op->src[2]->src[0]->src[0] : nullptr;
+
+    if (q_base == nullptr || q_base->op != GGML_OP_ROPE) {
+        return false;
+    }
+
+    // gemma3n direct attention path (no KV cache): q=ROPE, k=ROPE, v=RMS_NORM
+    // Only match this specific pattern to avoid falsely catching other models
+    // (e.g. Gemma4) that also use scale=1.0 with KV-cache backed attention.
+    const bool is_qkv_direct =
+        k_base != nullptr && v_base != nullptr && k_base->op == GGML_OP_ROPE && v_base->op == GGML_OP_RMS_NORM;
+
+    return is_qkv_direct;
+}
+
+static bool checked_mul_size(size_t a, size_t b, size_t & out) {
+    if (a == 0 || b == 0) {
+        out = 0;
+        return true;
+    }
+    if (a > SIZE_MAX / b) {
+        return false;
+    }
+    out = a * b;
+    return true;
+}
+
+static bool mul_mat_id_requires_large_tmp(const ggml_tensor * op) {
+    const ggml_tensor * as = op->src[0];
+    const ggml_tensor * ids = op->src[2];
+    if (as == nullptr || ids == nullptr) {
+        return true;
+    }
+
+    // The current OpenVINO translation materializes selected expert weights with
+    // shape [n_tokens, n_used, rows, k]. Skip cases that would create a very
+    // large temporary on GPU and let the scheduler fall back instead.
+    size_t tmp_elems = 1;
+    if (!checked_mul_size(tmp_elems, static_cast<size_t>(ids->ne[1]), tmp_elems) ||
+        !checked_mul_size(tmp_elems, static_cast<size_t>(ids->ne[0]), tmp_elems) ||
+        !checked_mul_size(tmp_elems, static_cast<size_t>(as->ne[1]), tmp_elems) ||
+        !checked_mul_size(tmp_elems, static_cast<size_t>(as->ne[0]), tmp_elems)) {
+        return true;
+    }
+
+    size_t tmp_bytes = 0;
+    if (!checked_mul_size(tmp_elems, sizeof(float), tmp_bytes)) {
+        return true;
+    }
+
+    static constexpr size_t mul_mat_id_tmp_limit = 1ULL << 30;  // 1 GiB
+    return tmp_bytes > mul_mat_id_tmp_limit;
+}
+
 static bool is_op_unsupported_case(const ggml_tensor * op) {
    switch (op->op) {
+    case GGML_OP_CONCAT: {
+        if (op->type == GGML_TYPE_I64) {
+            return true;
+        }
+        break;
+    }
    case GGML_OP_GET_ROWS:
    case GGML_OP_SET_ROWS: {
        if (op->ne[3] != 1) {
            return true;
        }
+        if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
+            // ERR = 0.000000306 > 0.000000100   GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
+            // ERR = 0.000000197 > 0.000000100   GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
+            return true;
+        }
+
+        // Keep the MoE routing weights gather on CPU for GPU runs. Splitting
+        // only at the later SUM/CLAMP/DIV nodes still leaves this routing path
+        // numerically unstable for arctic-style MoE graphs.
+        if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
+            return true;
+        }
+        break;
+    }
+    case GGML_OP_RESHAPE: {
+        if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
+            strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
+            return true;
+        }
        break;
    }
    case GGML_OP_ADD:
-    case GGML_OP_MUL: {
+    case GGML_OP_MUL:
+    case GGML_OP_SUB: {
        if (op->src[1]->op == GGML_OP_PERMUTE) {
            return true;
        }
@@ -818,30 +929,79 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
        }
        break;
    }
+    case GGML_OP_ADD_ID: {
+        // Keep support aligned with the CPU backend implementation, which only handles f32 inputs/output and i32 ids.
+        if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32 ||
+            op->src[2]->type != GGML_TYPE_I32) {
+            return true;
+        }
+        break;
+    }
+    case GGML_OP_DIV: {
+        bool requires_broadcast = false;
+        for (int i = 0; i < 4; i++) {
+            if (op->src[0]->ne[i] == op->src[1]->ne[i]) {
+                continue;
+            }
+
+            if (op->src[0]->ne[i] != 1 && op->src[1]->ne[i] != 1) {
+                return true;
+            }
+
+            requires_broadcast = true;
+        }
+
+        // The GPU plugin can fuse broadcast DIV into the preceding FFN GEMM path
+        // and produce infs for per-channel scale vectors. Keep those DIVs on CPU
+        // until the fused GPU kernel is reliable. (falied case llama-arch-test mpt)
+        if (requires_broadcast && ggml_openvino_get_device_name() == "GPU") {
+            return true;
+        }
+
+        // qwen3next MoE weight normalization is numerically sensitive on the GPU
+        // path. Keep the normalization divide on CPU to match the reference.
+        if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) {
+            return true;
+        }
+        break;
+    }
    case GGML_OP_SOFT_MAX: {
        if (op->src[2] != nullptr) {
            // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
            return true;
        }
-        float scale = 1.0f;
-        float max_bias = 0.0f;
-        const auto * op_params = op->op_params;
-        memcpy(&scale, (const float *) op_params + 0, sizeof(float));
-        memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
-        if (max_bias > 0) {
-            // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
+
+        if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) {
+            return true;
+        }
+
+        // GPU execution of the MoE routing weights softmax is numerically unstable
+        // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax
+        // on CPU so the scheduler splits at the same boundary that restores parity.
+        if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && op->src[0]->src[0] != nullptr &&
+            strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
+            return true;
+        }
+        break;
+    }
+    case GGML_OP_SUM_ROWS: {
+        if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) {
+            return true;
+        }
+
+        // if the input is PERMUTE skip
+        if (op->src[0]->op == GGML_OP_PERMUTE) {
+            return true;
+        }
+        break;
+    }
+    case GGML_OP_CLAMP: {
+        if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) {
            return true;
        }
        break;
    }
    case GGML_OP_FLASH_ATTN_EXT: {
-        if (op->src[4] != nullptr) {
-            // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
-            return true;
-        }
-        if (!is_supported_flash_attn_pattern(op)) {
-            return true;
-        }
        float scale = 1.0f;
        float max_bias = 0.0f;
        float logit_softcap = 0.0f;
@@ -849,6 +1009,21 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
        memcpy(&scale, (const float *) op_params + 0, sizeof(float));
        memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
        memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float));
+
+        // Keep gemma3n flash-attn pattern on CPU for GPU runs to avoid
+        // accuracy drift in the OpenVINO path. Restrict by scale=1.0 to avoid
+        // affecting non-gemma3n models such as Llama-3.2.
+        if (fabsf(scale - 1.0f) < 1e-6f && is_gemma3n_flash_attn_pattern(op)) {
+            return true;
+        }
+
+        if (op->src[4] != nullptr) {
+            // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
+            return true;
+        }
+        if (!is_supported_flash_attn_pattern(op)) {
+            return true;
+        }
        if (max_bias > 0) {
            // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n");
            return true;
@@ -868,34 +1043,44 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
        break;
    }
    case GGML_OP_CPY: {
-        if (op->src[1] != op) {
-            // GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n");
+        if (op->src[0]->type == GGML_TYPE_BF16 || op->src[1]->type == GGML_TYPE_BF16) {
+            // GGML_LOG_WARN("OpenVINO backend does not support CPY with non-contiguous data or bf16 types\n");
+            return true;
+        }
+        // op test case with non-contiguous src or dst
+        if ((op->ne[0] == 3 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) ||
+            (op->ne[0] == 1 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) ||
+            (op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) {
            return true;
        }
        break;
    }
    case GGML_OP_MUL_MAT: {
-        if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
-            // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
-            // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
+        if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX &&
+            op->src[0]->op == GGML_OP_CONT && op->src[0]->src[0] != nullptr &&
+            op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr &&
+            op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
            return true;
        }
        if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
            return true;
        }
-        if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
-            return true;
-        }
-        if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
-            // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
-            // triggers a bug in ov matmul_shape_inference.hpp
-            return true;
-        }
        if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) {
            return true;
        }
        break;
    }
+    case GGML_OP_MUL_MAT_ID: {
+        if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 ||
+            strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) {
+            return true;
+        }
+
+        if (mul_mat_id_requires_large_tmp(op)) {
+            return true;
+        }
+        break;
+    }
    case GGML_OP_ROPE: {
        const int32_t * op_params = op->op_params;
        const int n_dims = op_params[1];
@@ -909,7 +1094,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
            //               op->src[0]->ne[0]);
            return true;
        }
-        if (op->type != GGML_TYPE_F32) {
+        if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) {
            // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
            return true;
        }
@@ -930,15 +1115,54 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
        }
        break;
    }
-    default:
-        break;
-    }
-    if (op->op == GGML_OP_GET_ROWS) {
-        if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
-            // ERR = 0.000000306 > 0.000000100   GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
-            // ERR = 0.000000197 > 0.000000100   GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
+    case GGML_OP_TRANSPOSE: {
+        // if the type is bf16, will return true
+        if (op->type == GGML_TYPE_BF16) {
+            // GGML_LOG_WARN("OpenVINO backend does not support CONT with BF16 type\n");
            return true;
        }
+        break;
+    }
+    case GGML_OP_GATED_DELTA_NET: {
+        // enable after https://github.com/openvinotoolkit/openvino/pull/35917 is included in OV release
+        return true;
+        // if (ggml_openvino_get_device_name() == "GPU" && op->src[0]->ne[2] > 1) {
+        //     // CVS-186471
+        //     return true;
+        // }
+        if (op->src[2]->op == GGML_OP_PERMUTE) {
+            return true;
+        }
+        // kda (per-key-dimension gating) not supported by fused GatedDeltaNet op
+        if (op->src[3]->ne[0] != 1) {
+            return true;
+        }
+        // v_repeat > 1 (GQA): ggml uses modulo head mapping (h_q = h_v % H_k)
+        // but the fused op uses consecutive mapping (h_q = h_v / group_size)
+        if (op->src[2]->ne[1] != op->src[0]->ne[1]) {
+            return true;
+        }
+        // K > 1 (multiple state snapshots) not supported by fused op
+        if (op->src[5]->ne[1] > 1) {
+            return true;
+        }
+        break;
+    }
+    case GGML_OP_SSM_CONV: {
+        // qwen3next is numerically unstable with OpenVINO SSM_CONV.
+        // Keep this op on CPU until the OpenVINO implementation is fixed.
+        return true;
+    }
+    case GGML_OP_VIEW: {
+        // Skip TOPK_MOE fused tests until it is fully supported
+        // the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe
+        if (strcmp(op->name, "selected_experts") == 0) {
+            return true;
+        }
+        break;
+    }
+    default:
+        break;
    }
    return false;
 }
@@ -946,24 +1170,47 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
 static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    GGML_ASSERT(dev->reg != nullptr);

-    static std::set<ggml_type> supported_types{GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_I64,
-                                               GGML_TYPE_I32,  GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K,
-                                               GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
+    static std::unordered_set<ggml_type> supported_types{
+        GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_I64,  GGML_TYPE_I32,  GGML_TYPE_Q4_0,
+        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};

-    static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
-                                                 /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
-                                                 GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
-                                                 // softmax is not updated due to replaced by flash_attn_ext
-                                                 // GGML_OP_SOFT_MAX,
-                                                 GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
-    static const std::set<ggml_unary_op> supported_unary_ops{
-        GGML_UNARY_OP_GELU,
-        GGML_UNARY_OP_SILU,
-    };
-    static const std::set<ggml_glu_op> supported_glu_ops{
-        GGML_GLU_OP_SWIGLU,
-        GGML_GLU_OP_GEGLU,
+    // derive supported op sets from the op_table map, keys in
+    // the map use the full macro name (e.g. "GGML_OP_ADD"), while
+    // the ggml_*_op_name() helpers return only the trailing part (e.g. "ADD").
+    // each set is built once and cached.
+    static const auto build_supported_sets = [] {
+        const auto & table = ov::frontend::ggml::get_supported_ops();
+        std::unordered_set<ggml_op> ops;
+        std::unordered_set<ggml_unary_op> unary_ops;
+        std::unordered_set<ggml_glu_op> glu_ops;
+
+        // GGML_OP_NONE has no translator but is always safe to add to the supported set.
+        ops.insert(GGML_OP_NONE);
+
+        for (int i = 0; i < GGML_OP_COUNT; ++i) {
+            const std::string key = std::string("GGML_OP_") + ggml_op_name(static_cast<ggml_op>(i));
+            if (table.count(key)) {
+                ops.insert(static_cast<ggml_op>(i));
+            }
+        }
+        for (int i = 0; i < GGML_UNARY_OP_COUNT; ++i) {
+            const std::string key = std::string("GGML_UNARY_OP_") + ggml_unary_op_name(static_cast<ggml_unary_op>(i));
+            if (table.count(key)) {
+                unary_ops.insert(static_cast<ggml_unary_op>(i));
+            }
+        }
+        for (int i = 0; i < GGML_GLU_OP_COUNT; ++i) {
+            const std::string key = std::string("GGML_GLU_OP_") + ggml_glu_op_name(static_cast<ggml_glu_op>(i));
+            if (table.count(key)) {
+                glu_ops.insert(static_cast<ggml_glu_op>(i));
+            }
+        }
+        return std::make_tuple(ops, unary_ops, glu_ops);
    };
+    static const auto supported_sets = build_supported_sets();
+    static const auto & supported_ops = std::get<0>(supported_sets);
+    static const auto & supported_unary_ops = std::get<1>(supported_sets);
+    static const auto & supported_glu_ops = std::get<2>(supported_sets);

    switch (op->op) {
    case GGML_OP_UNARY: {
@@ -972,11 +1219,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
            // GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op)));
            return false;
        }
-        if (has_view_op_input(op)) {
-            // GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n",
-            //               ggml_unary_op_name(ggml_get_unary_op(op)));
-            return false;
-        }
        break;
    }
    case GGML_OP_GLU: {
@@ -1003,13 +1245,15 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
            return false;
        }
        static std::set<ggml_op> ops_not_support_view_input{
-            GGML_OP_GET_ROWS,
-            GGML_OP_RMS_NORM,
+            GGML_OP_L2_NORM,
        };
        if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_op_input(op)) {
            // GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op));
            return false;
        }
+        if (op->op == GGML_OP_RMS_NORM && has_non_contiguous_view_input(op)) {
+            return false;
+        }
    }
    }

@@ -126,6 +126,68 @@ void extract_q4_1_data(const ggml_tensor * tensor,
    }
 }

+// Extracts (weight, scales, zp) from Q5_1 tensors.
+// Data layout is: |16 bit scale|16 bit min|32 bit qh (5th bits)|32 x 4bit low nibbles|.
+// Reconstructed quant q in [0,31]: q = (low nibble) | (qh_bit << 4). Dequant: w*d + m.
+// Weights are stored as u8 (5-bit values do not fit u4), matching make_int8_weights.
+void extract_q5_1_data(const ggml_tensor * tensor,
+                       ov::Tensor & weights_arr,
+                       ov::Tensor & scales_arr,
+                       ov::Tensor & zp_arr,
+                       bool use_bias) {
+    const uint64_t bytes_per_block = 24;  // 2 scale + 2 min + 4 qh + 16 (32x0.5) weights
+    const int qk = 32;
+
+    auto * data = static_cast<uint8_t *>(tensor->data);
+    auto * weights = static_cast<uint8_t *>(weights_arr.data());  // u8 weights, one byte per weight
+    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+
+    // Read a 16-bit little-endian value without aliasing/const-qual violations.
+    auto read_u16 = [](const uint8_t * p) {
+        uint16_t v;
+        memcpy(&v, p, sizeof(v));
+        return v;
+    };
+
+    auto unpack_block = [&](const uint8_t * block, uint8_t * dst) {
+        uint32_t qh;
+        memcpy(&qh, block + 4, sizeof(uint32_t));
+        const uint8_t * qs = block + 8;
+        for (int j = 0; j < qk / 2; ++j) {
+            const uint8_t lo = qs[j] & 0x0F;
+            const uint8_t hi = qs[j] >> 4;
+            const uint8_t bit_lo = (qh >> j) & 1;
+            const uint8_t bit_hi = (qh >> (j + qk / 2)) & 1;
+            dst[j] = lo | (bit_lo << 4);           // first 16 weights
+            dst[j + qk / 2] = hi | (bit_hi << 4);  // last 16 weights
+        }
+    };
+
+    if (use_bias) {
+        // Store bias (min) directly as f16: dequant w*d + m
+        auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            const uint8_t * block = data + i * bytes_per_block;
+            float scale = static_cast<float>(ov::float16::from_bits(read_u16(block)));
+            float min = static_cast<float>(ov::float16::from_bits(read_u16(block + 2)));
+            scales[i] = ov::float16(scale);
+            bias[i] = ov::float16(min);
+            unpack_block(block, weights + i * qk);
+        });
+    } else {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());  // u8 zero points
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            const uint8_t * block = data + i * bytes_per_block;
+            float scale = static_cast<float>(ov::float16::from_bits(read_u16(block)));
+            float min = static_cast<float>(ov::float16::from_bits(read_u16(block + 2)));
+            scales[i] = ov::float16(scale);
+            // zp = -min / scale (dequant: (w - zp) * s == w*s + min)
+            zp[i] = (scale != 0.0f) ? (uint8_t) std::lround(-min / scale) : 0;
+            unpack_block(block, weights + i * qk);
+        });
+    }
+}
+
 // Extracts (weight, scales, zp) from Q8_0 tensors.
 // Data layout is: |16 bit scale|32 x 8bit weights|.
 // When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
@@ -577,6 +639,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
        weights_per_block = 32;
        break;
    case GGML_TYPE_Q8_0:
+    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q5_K:
        is_u4 = false;
        weights_per_block = 32;
@@ -601,6 +664,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
    case GGML_TYPE_Q4_K:
        extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
        break;
+    case GGML_TYPE_Q5_1:
+        extract_q5_1_data(&temp_tensor, weights, scales, zp, use_bias);
+        break;
    case GGML_TYPE_Q8_0:
        extract_q8_0_data(&temp_tensor, weights, scales, zp);
        break;
@@ -6,7 +6,7 @@
 #include <openvino/op/constant.hpp>
 #include <openvino/runtime/tensor.hpp>

-void unpack_32_4(const uint8_t* data, uint8_t* dst);
+void unpack_32_4(const uint8_t * data, uint8_t * dst);

 void extract_q4_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
@@ -19,12 +19,18 @@ void extract_q4_1_data(const ggml_tensor * tensor,
                       ov::Tensor & zp_arr,
                       bool use_bias = false);

+void extract_q5_1_data(const ggml_tensor * tensor,
+                       ov::Tensor & weights_arr,
+                       ov::Tensor & scales_arr,
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
+
 void extract_q8_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr);

-void unpack_256_4(const uint8_t* data, uint8_t* dst);
+void unpack_256_4(const uint8_t * data, uint8_t * dst);

 void extract_q4_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
@@ -145,8 +151,8 @@ namespace ov {
 namespace op {
 namespace util {
 // From <openvino>/src/common/transformations/include/transformations/utils/utils.hpp
-bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
-                      float& value,
+bool get_single_value(const std::shared_ptr<ov::op::v0::Constant> & const_node,
+                      float & value,
                      bool check_value_range = true);
 }  // namespace util
 }  // namespace op
@@ -3,6 +3,8 @@
 #include <cstdint>
 #include <map>
 #include <openvino/core/node.hpp>
+#include <openvino/core/partial_shape.hpp>
+#include <openvino/core/shape.hpp>
 #include <openvino/frontend/decoder.hpp>
 #include <string>

@@ -12,22 +14,50 @@ namespace ggml {

 class GgmlDecoder : public DecoderBase {
 public:
-    virtual ov::Any get_attribute(const std::string& name) const = 0;
+    virtual ov::Any get_attribute(const std::string & name) const = 0;

-    virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0;
+    virtual PartialShape get_input_shape(int node_idx, const std::string & name) const = 0;

-    virtual std::vector<size_t> get_input_stride(int node_idx, const std::string& name) const = 0;
+    virtual std::vector<size_t> get_input_stride(int node_idx, const std::string & name) const = 0;

-    virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0;
+    virtual size_t get_view_input_size(int node_idx, const std::string & name) const = 0;
+
+    virtual size_t get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual size_t get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual std::vector<size_t> get_view_input_stride(int node_idx,
+                                                      const std::string & name,
+                                                      size_t view_index) const = 0;
+
+    virtual std::vector<size_t> get_view_input_src_stride(int node_idx,
+                                                          const std::string & name,
+                                                          size_t view_index) const = 0;
+
+    virtual Shape get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual Shape get_view_input_src_ggml_shape(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual PartialShape get_view_input_ov_shape(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual PartialShape get_view_input_src_ov_shape(int node_idx,
+                                                     const std::string & name,
+                                                     size_t view_index) const = 0;
+
+    virtual std::string get_view_input_name(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual std::string get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const = 0;
+
+    virtual element::Type get_input_type(int node_idx, const std::string & name) const = 0;

    virtual size_t get_input_size() const = 0;

    virtual size_t get_input_size(int node_idx) const = 0;

    virtual void get_input_node(size_t input_port_idx,
-                                std::string& producer_name,
-                                std::string& producer_output_port_name,
-                                size_t& producer_output_port_index) const = 0;
+                                std::string & producer_name,
+                                std::string & producer_output_port_name,
+                                size_t & producer_output_port_index) const = 0;

    virtual std::vector<std::string> get_input_names(int node_idx) const = 0;

@@ -35,30 +65,36 @@ public:

    virtual element::Type get_output_type(const int node_idx) const = 0;

-    virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;
+    virtual std::vector<size_t> get_output_stride(int node_idx) const = 0;
+
+    virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const = 0;

    virtual int32_t * get_output_op_params(int node_idx) const = 0;

+    virtual size_t get_output_op_offset(int node_idx) const = 0;
+
    virtual std::vector<std::string> get_output_names(int node_idx) const = 0;

-    virtual const std::string& get_op_type() const = 0;
+    virtual const std::string & get_op_type() const = 0;

-    virtual const std::string& get_op_type(int node_idx) const = 0;
+    virtual const std::string & get_op_type(int node_idx) const = 0;

-    virtual const std::string& get_op_name() const = 0;
+    virtual const std::string & get_op_name() const = 0;

-    virtual const std::string& get_op_name(int node_idx) const = 0;
+    virtual const std::string & get_op_name(int node_idx) const = 0;

    virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const = 0;

    virtual int get_op_case(int node_idx) const = 0;

-    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
-    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
-    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
+    virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_inputs() const = 0;
+    virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_extra_inputs() const = 0;
+    virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_weights() const = 0;
    virtual std::vector<std::string> get_model_output_names() const = 0;

-    virtual int32_t* get_rope_params() const = 0;
+    virtual int32_t * get_rope_params() const = 0;
+
+    virtual bool has_mixed_rope_params() const = 0;

    virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;

@@ -66,7 +102,11 @@ public:

    virtual bool is_stateful() const = 0;

+    virtual bool is_splited_model() const = 0;
+
    virtual int is_swa_layer(int layer) const = 0;
+
+    virtual int32_t get_op_dynamic_dim(int node_idx) const = 0;
 };

 }  // namespace ggml
@@ -15,7 +15,7 @@ public:
    using Ptr = std::shared_ptr<FrontEnd>;
    FrontEnd();

-    static std::shared_ptr<Model> convert(const InputModel::Ptr& model, bool naive = false);
+    static std::shared_ptr<Model> convert(const InputModel::Ptr & model, bool naive = false);
 };

 }  // namespace ggml
@@ -1,9 +1,9 @@
 #pragma once

-#include <openvino/frontend/input_model.hpp>
-
 #include "decoder.h"

+#include <openvino/frontend/input_model.hpp>
+
 namespace ov {
 namespace frontend {
 namespace ggml {
@@ -16,9 +16,9 @@ class InputModel : public ov::frontend::InputModel {
    friend class ::ov::frontend::ggml::FrontEnd;

 public:
-    explicit InputModel(const std::shared_ptr<GgmlDecoder>& gdecoder);
+    explicit InputModel(const std::shared_ptr<GgmlDecoder> & gdecoder);

-    const std::shared_ptr<GgmlDecoder>& get_model_decoder() const;
+    const std::shared_ptr<GgmlDecoder> & get_model_decoder() const;

 private:
    std::shared_ptr<GgmlDecoder> m_decoder;
@@ -1,11 +1,11 @@
 #pragma once

+#include "decoder.h"
+
 #include <cstdint>
 #include <openvino/frontend/node_context.hpp>
 #include <string>

-#include "decoder.h"
-
 namespace ov {
 namespace frontend {
 namespace ggml {
@@ -16,28 +16,24 @@ typedef std::map<std::string, Output<Node>> TensorMap;

 class NodeContext : public frontend::NodeContext {
 public:
-    NodeContext(const std::shared_ptr<GgmlDecoder>& decoder,
-                std::shared_ptr<TensorMap>& tensor_map,
+    NodeContext(const std::shared_ptr<GgmlDecoder> & decoder,
+                std::shared_ptr<TensorMap> & tensor_map,
                int node_idx,
-                TranslateSession* translate_session = nullptr)
-        : ov::frontend::NodeContext(decoder->get_op_type(node_idx)),
-          m_decoder(decoder),
-          m_tensor_map(tensor_map),
-          m_node_idx(node_idx),
-          m_translate_session(translate_session) {
+                TranslateSession * translate_session = nullptr) :
+        ov::frontend::NodeContext(decoder->get_op_type(node_idx)),
+        m_decoder(decoder),
+        m_tensor_map(tensor_map),
+        m_node_idx(node_idx),
+        m_translate_session(translate_session) {
        m_input_names = decoder->get_input_names(m_node_idx);
        m_output_names = decoder->get_output_names(m_node_idx);
    }

-    TranslateSession* get_translate_session() const {
-        return m_translate_session;
-    }
+    TranslateSession * get_translate_session() const { return m_translate_session; }

-    const std::vector<std::string>& get_input_names() const { return m_input_names; }
+    const std::vector<std::string> & get_input_names() const { return m_input_names; }

-    size_t get_input_size() const override {
-        return m_decoder->get_input_size(m_node_idx);
-    }
+    size_t get_input_size() const override { return m_decoder->get_input_size(m_node_idx); }

    ov::element::Type get_input_type(size_t index) const {
        return m_decoder->get_input_type(m_node_idx, m_input_names[index]);
@@ -55,42 +51,103 @@ public:

    PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); }

-    int32_t* get_input_op_params(size_t index) const {
+    int32_t * get_input_op_params(size_t index) const {
        return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
    }

-    int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
-
-    ov::element::Type get_output_type() const {
-        return m_decoder->get_output_type(m_node_idx);
+    size_t get_view_input_size(size_t index) const {
+        return m_decoder->get_view_input_size(m_node_idx, m_input_names[index]);
    }

+    size_t get_view_input_offset(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_offset(m_node_idx, m_input_names[index], view_index);
+    }
+
+    size_t get_view_input_src_offset(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_src_offset(m_node_idx, m_input_names[index], view_index);
+    }
+
+    std::vector<size_t> get_view_input_stride(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_stride(m_node_idx, m_input_names[index], view_index);
+    }
+
+    std::vector<size_t> get_view_input_src_stride(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_src_stride(m_node_idx, m_input_names[index], view_index);
+    }
+
+    ov::Shape get_view_input_ggml_shape(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_ggml_shape(m_node_idx, m_input_names[index], view_index);
+    }
+
+    ov::Shape get_view_input_src_ggml_shape(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_src_ggml_shape(m_node_idx, m_input_names[index], view_index);
+    }
+
+    ov::PartialShape get_view_input_ov_shape(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_ov_shape(m_node_idx, m_input_names[index], view_index);
+    }
+
+    ov::PartialShape get_view_input_src_ov_shape(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_src_ov_shape(m_node_idx, m_input_names[index], view_index);
+    }
+
+    std::string get_view_input_name(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_name(m_node_idx, m_input_names[index], view_index);
+    }
+
+    std::string get_view_input_src_name(size_t index, size_t view_index) const {
+        return m_decoder->get_view_input_src_name(m_node_idx, m_input_names[index], view_index);
+    }
+
+    int32_t get_op_dynamic_dim() const { return m_decoder->get_op_dynamic_dim(m_node_idx); }
+
+    int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
+
+    size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); }
+
+    ov::element::Type get_output_type() const { return m_decoder->get_output_type(m_node_idx); }
+
+    std::vector<size_t> get_output_stride() const { return m_decoder->get_output_stride(m_node_idx); }
+
    Output<Node> get_input(int idx) const override {
+        // Check if this input is a VIEW
+        size_t view_input_size = m_decoder->get_view_input_size(m_node_idx, m_input_names[idx]);
+        if (view_input_size > 0) {
+            // This is a VIEW input, get the base tensor name (last element in the chain)
+            std::string base_name =
+                m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1);
+            // Check if the VIEW has been resolved (translate_view produced a Slice)
+            auto view_it = m_tensor_map->find(m_input_names[idx]);
+            if (!base_name.empty() && view_it != m_tensor_map->end()) {
+                auto base_it = m_tensor_map->find(base_name);
+                if (base_it != m_tensor_map->end() &&
+                    view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) {
+                    return view_it->second;
+                }
+                return base_it->second;
+            }
+            if (!base_name.empty()) {
+                return m_tensor_map->at(base_name);
+            }
+        }
+        // Not a VIEW or failed to get base name, use the original logic
        return m_tensor_map->at(m_input_names[idx]);
    }

-    Output<Node> get_input(const std::string& name) const override {
+    Output<Node> get_input(const std::string & name) const override {
        if (m_tensor_map->find(name) == m_tensor_map->end()) {
            throw std::runtime_error("'" + name + "' not found in tensor map.");
        }
        return m_tensor_map->at(name);
    }

-    bool has_input(const std::string& name) const {
-        return m_tensor_map->find(name) != m_tensor_map->end();
-    }
+    bool has_input(const std::string & name) const { return m_tensor_map->find(name) != m_tensor_map->end(); }

-    const std::string& get_name() const override {
-        return m_decoder->get_op_name(m_node_idx);
-    }
+    const std::string & get_name() const override { return m_decoder->get_op_name(m_node_idx); }

-    ov::Any get_attribute_as_any(const std::string& name) const override {
-        return m_decoder->get_attribute(name);
-    }
+    ov::Any get_attribute_as_any(const std::string & name) const override { return m_decoder->get_attribute(name); }

-    int get_op_case() const {
-        return m_decoder->get_op_case(m_node_idx);
-    }
+    int get_op_case() const { return m_decoder->get_op_case(m_node_idx); }

    bool is_static() const { return m_decoder->is_static(); }

@@ -98,14 +155,14 @@ public:

 private:
    std::shared_ptr<GgmlDecoder> m_decoder;
-    std::shared_ptr<TensorMap>& m_tensor_map;
+    std::shared_ptr<TensorMap> & m_tensor_map;
    int m_node_idx;
-    TranslateSession* m_translate_session;
+    TranslateSession * m_translate_session;
    std::vector<std::string> m_input_names;
    std::vector<std::string> m_output_names;
 };

-using CreatorFunction = std::function<ov::OutputVector(const ov::frontend::ggml::NodeContext&)>;
+using CreatorFunction = std::function<ov::OutputVector(const ov::frontend::ggml::NodeContext &)>;

 }  // namespace ggml
 }  // namespace frontend
@@ -0,0 +1,62 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+
+#include <memory>
+#include <openvino/core/node.hpp>
+#include <openvino/core/node_output.hpp>
+#include <openvino/op/add.hpp>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/gather.hpp>
+#include <openvino/op/reshape.hpp>
+#include <openvino/op/shape_of.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_add_id(const NodeContext & context) {
+    num_inputs_check(context, 3, 3);
+
+    auto input = process_view_input_new(context, 0);
+    auto bias = process_view_input_new(context, 1);
+    auto ids = process_view_input_new(context, 2);
+
+    // OpenVINO uses reversed GGML dimensions:
+    //   input: [1, n_token, n_used, n_embd]
+    //   bias:  [1, 1, n_expert, n_embd]
+    //   ids:   [1, 1, n_token, n_used]
+    auto bias_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(bias, ov::element::i64);
+    auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
+
+    bias = std::make_shared<ov::op::v1::Reshape>(bias, get_dimensions(bias_shape_4d, {2, 3}), false);
+    ids = std::make_shared<ov::op::v1::Reshape>(ids, get_dimensions(ids_shape_4d, {2, 3}), false);
+
+    if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
+        ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
+    }
+
+    auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
+    ov::Output<ov::Node> selected_bias = std::make_shared<ov::op::v8::Gather>(bias, ids, gather_axis);
+    selected_bias = std::make_shared<ov::op::v1::Reshape>(
+        selected_bias, std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64), false);
+
+    if (selected_bias.get_element_type() != input.get_element_type()) {
+        selected_bias = std::make_shared<ov::op::v0::Convert>(selected_bias, input.get_element_type());
+    }
+
+    ov::Output<ov::Node> res = std::make_shared<ov::op::v1::Add>(input, selected_bias);
+    const auto output_type = context.get_output_type();
+    if (res.get_element_type() != output_type) {
+        res = std::make_shared<ov::op::v0::Convert>(res, output_type);
+    }
+
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
@@ -0,0 +1,47 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+#include "ggml.h"
+
+#include <openvino/frontend/exception.hpp>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/squeeze.hpp>
+#include <openvino/op/topk.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_argsort(const NodeContext & context) {
+    num_inputs_check(context, 1, 1);
+
+    auto input = process_view_input_new(context, 0);
+
+    const int32_t order = context.get_output_op_params()[0];
+
+    ov::op::v11::TopK::Mode mode;
+    switch (order) {
+    case GGML_SORT_ORDER_ASC:
+        mode = ov::op::v11::TopK::Mode::MIN;
+        break;
+    case GGML_SORT_ORDER_DESC:
+        mode = ov::op::v11::TopK::Mode::MAX;
+        break;
+    default:
+        FRONT_END_OP_CONVERSION_CHECK(false, "Unsupported GGML_OP_ARGSORT order: ", order);
+    }
+
+    auto k = std::make_shared<ov::op::v0::Squeeze>(get_dimensions(input.get_node_shared_ptr(), {3}),
+                                                   ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
+
+    auto topk = std::make_shared<ov::op::v11::TopK>(input, k, 3, mode, ov::op::v11::TopK::SortType::SORT_VALUES,
+                                                    context.get_output_type(), false);
+
+    return rename_outputs_with_suffix({topk->output(1)}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
@@ -0,0 +1,33 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+
+#include <cstring>
+#include <openvino/op/clamp.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_clamp(const NodeContext & context) {
+    num_inputs_check(context, 1, 1);
+
+    auto input = process_view_input_new(context, 0);
+
+    const int32_t * op_params = context.get_output_op_params();
+    FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "CLAMP requires output op params");
+
+    float min;
+    float max;
+    std::memcpy(&min, reinterpret_cast<const float *>(op_params) + 0, sizeof(float));
+    std::memcpy(&max, reinterpret_cast<const float *>(op_params) + 1, sizeof(float));
+
+    auto res = std::make_shared<ov::op::v0::Clamp>(input, min, max);
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
@@ -0,0 +1,48 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+
+#include <memory>
+#include <openvino/frontend/exception.hpp>
+#include <openvino/op/concat.hpp>
+#include <openvino/op/convert.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_concat(const NodeContext & context) {
+    num_inputs_check(context, 2, 2);
+
+    const int32_t * op_params = context.get_output_op_params();
+    FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "CONCAT requires output op params");
+
+    const auto output_shape = context.get_output_shape();
+    FRONT_END_CHECK_IMPLEMENTED(output_shape.rank().is_static(), "CONCAT requires static output rank");
+
+    const auto rank = output_shape.rank().get_length();
+    const int32_t ggml_dim = op_params[0];
+    FRONT_END_CHECK_IMPLEMENTED(ggml_dim >= 0 && ggml_dim < rank, "CONCAT axis is out of range");
+
+    auto input_0 = process_view_input_new(context, 0);
+    auto input_1 = process_view_input_new(context, 1);
+    const auto output_type = context.get_output_type();
+
+    if (input_0.get_element_type() != output_type) {
+        input_0 = std::make_shared<ov::op::v0::Convert>(input_0, output_type);
+    }
+    if (input_1.get_element_type() != output_type) {
+        input_1 = std::make_shared<ov::op::v0::Convert>(input_1, output_type);
+    }
+
+    const auto axis = static_cast<int64_t>(rank - 1 - ggml_dim);
+    auto res = std::make_shared<ov::op::v0::Concat>(OutputVector{input_0, input_1}, axis);
+
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
@@ -18,27 +18,19 @@ namespace op {
 OutputVector translate_cont(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

-    int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
-
    auto src_shape = context.get_input_shape(0).to_shape();
    auto dst_shape = context.get_output_shape().to_shape();
-    ov::Output<Node> res;

-    if (op_case == 1) {
-        // The input comes from a PERMUTE
-        throw std::runtime_error("Code of this case might be outdated");
-        dst_shape[1] = -1;
-        res = std::make_shared<ov::op::v1::Reshape>(
-            context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
-    } else if (op_case == 2) {
-        // The input comes from a TRANSPOSE
-        return {context.get_input(0)};
-    } else {
-        // The input comes from a VIEW
-        res = process_view_input(context, 0);
+    if (context.get_op_dynamic_dim() != -1) {
+        dst_shape[3 - context.get_op_dynamic_dim()] = -1;
    }

+    auto input = process_view_input_new(context, 0);
+
+    ov::Output<Node> res;
+    res = std::make_shared<ov::op::v1::Reshape>(
+        input, ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
+
    return rename_outputs_with_suffix({res}, context.get_name());
 }

@@ -3,7 +3,9 @@
 #include "../utils.h"

 #include <memory>
+#include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
+#include <openvino/op/reshape.hpp>

 namespace ov {
 namespace frontend {
@@ -11,7 +13,18 @@ namespace ggml {
 namespace op {

 OutputVector translate_cpy(const NodeContext & context) {
-    auto res = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_output_type());
+    auto input = process_view_input_new(context, 0);
+    auto input_shape = context.get_input_shape(0);
+    auto output_shape = context.get_output_shape();
+
+    // Non-cast CPY may need a reshape (e.g. [3,192,1,1] -> [576,1,1,1])
+    if (input_shape != output_shape) {
+        auto new_shape = ov::op::v0::Constant::create(
+            ov::element::i64, {static_cast<size_t>(output_shape.rank().get_length())}, output_shape.to_shape());
+        input = std::make_shared<ov::op::v1::Reshape>(input, new_shape, false);
+    }
+
+    auto res = std::make_shared<ov::op::v0::Convert>(input, context.get_output_type());
    return rename_outputs_with_suffix({res}, context.get_name());
 }

@@ -0,0 +1,146 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+#include "ggml.h"
+
+#include <memory>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/divide.hpp>
+#include <openvino/op/multiply.hpp>
+#include <openvino/op/shape_of.hpp>
+#include <openvino/op/sigmoid.hpp>
+#include <openvino/op/tile.hpp>
+#include <openvino/op/util/precision_sensitive_attribute.hpp>
+#include <vector>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+namespace {
+
+bool is_silu_div_pattern(const ov::Output<ov::Node> & numerator,
+                         const ov::Output<ov::Node> & denominator,
+                         const NodeContext & context) {
+    if (context.get_input_size() != 2) {
+        return false;
+    }
+
+    const auto * unary_op = reinterpret_cast<const ggml_unary_op *>(context.get_input_op_params(0));
+    if (unary_op == nullptr || *unary_op != GGML_UNARY_OP_SILU) {
+        return false;
+    }
+
+    auto mul = std::dynamic_pointer_cast<ov::op::v1::Multiply>(numerator.get_node_shared_ptr());
+    if (!mul) {
+        return false;
+    }
+
+    const auto denom_node = denominator.get_node_shared_ptr();
+    const auto mul_input_0 = mul->input_value(0).get_node_shared_ptr();
+    const auto mul_input_1 = mul->input_value(1).get_node_shared_ptr();
+
+    auto sigmoid = std::dynamic_pointer_cast<ov::op::v0::Sigmoid>(mul_input_1);
+    if (mul_input_0 == denom_node && sigmoid && sigmoid->input_value(0).get_node_shared_ptr() == denom_node) {
+        return true;
+    }
+
+    sigmoid = std::dynamic_pointer_cast<ov::op::v0::Sigmoid>(mul_input_0);
+    return mul_input_1 == denom_node && sigmoid && sigmoid->input_value(0).get_node_shared_ptr() == denom_node;
+}
+
+ov::Output<ov::Node> repeat_input_to_match(const NodeContext & context,
+                                           const ov::Output<ov::Node> & input,
+                                           const ov::Output<ov::Node> & target,
+                                           size_t input_index) {
+    const auto input_shape = context.get_input_shape(input_index);
+    const auto target_shape = context.get_input_shape(0);
+
+    if (input_shape == target_shape) {
+        return input;
+    }
+
+    if (input_shape.rank().is_static() && target_shape.rank().is_static()) {
+        const auto rank = static_cast<size_t>(input_shape.rank().get_length());
+        std::vector<int64_t> repeats(rank, 1);
+        bool needs_repeat = false;
+
+        for (size_t axis = 0; axis < rank; ++axis) {
+            FRONT_END_OP_CONVERSION_CHECK(input_shape[axis].is_static() && target_shape[axis].is_static(),
+                                          "DIV repeat requires static dimensions on both inputs");
+
+            const int64_t input_dim = input_shape[axis].get_length();
+            const int64_t target_dim = target_shape[axis].get_length();
+
+            FRONT_END_OP_CONVERSION_CHECK(input_dim > 0 && target_dim > 0 && target_dim % input_dim == 0,
+                                          "DIV input shape ", input_shape, " cannot repeat to match ", target_shape);
+
+            repeats[axis] = target_dim / input_dim;
+            needs_repeat = needs_repeat || repeats[axis] != 1;
+        }
+
+        if (!needs_repeat) {
+            return input;
+        }
+
+        auto repeats_node = ov::op::v0::Constant::create(ov::element::i64, {repeats.size()}, repeats);
+        return std::make_shared<ov::op::v0::Tile>(input, repeats_node);
+    }
+
+    auto input_shape_node = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
+    auto target_shape_node = std::make_shared<ov::op::v3::ShapeOf>(target, ov::element::i64);
+    auto repeats_node = std::make_shared<ov::op::v1::Divide>(target_shape_node, input_shape_node);
+    return std::make_shared<ov::op::v0::Tile>(input, repeats_node);
+}
+
+}  // namespace
+
+OutputVector translate_div(const NodeContext & context) {
+    num_inputs_check(context, 2, 2);
+
+    auto input_0 = process_view_input_new(context, 0);
+    auto input_1 = process_view_input_new(context, 1);
+
+    if (is_silu_div_pattern(input_0, input_1, context)) {
+        ov::Output<ov::Node> res = std::make_shared<ov::op::v0::Sigmoid>(input_1);
+        if (res.get_element_type() != context.get_output_type()) {
+            res = std::make_shared<ov::op::v0::Convert>(res, context.get_output_type());
+        }
+        return rename_outputs_with_suffix({res}, context.get_name());
+    }
+
+    input_1 = repeat_input_to_match(context, input_1, input_0, 1);
+
+    const auto output_type = context.get_output_type();
+    const bool use_f32_compute = input_0.get_element_type() != ov::element::f32 ||
+                                 input_1.get_element_type() != ov::element::f32 || output_type != ov::element::f32;
+
+    if (use_f32_compute) {
+        input_0 = std::make_shared<ov::op::v0::Convert>(input_0, ov::element::f32);
+        input_1 = std::make_shared<ov::op::v0::Convert>(input_1, ov::element::f32);
+    }
+
+    ov::Output<ov::Node> res = std::make_shared<ov::op::v1::Divide>(input_0, input_1);
+    if (use_f32_compute) {
+        // Keep the reciprocal/divide path in FP32. Without this hint, the GPU
+        // plugin can still compress the subgraph back to FP16 and overflow on
+        // small shexp gate values (e.g. silu(x) / x in qwen2moe).
+        ov::mark_as_precision_sensitive(res.get_node_shared_ptr()->input(0));
+        ov::mark_as_precision_sensitive(res.get_node_shared_ptr()->input(1));
+    }
+    if (res.get_element_type() != output_type) {
+        auto output_convert = std::make_shared<ov::op::v0::Convert>(res, output_type);
+        if (use_f32_compute) {
+            ov::mark_as_precision_sensitive(output_convert->input(0));
+        }
+        res = output_convert;
+    }
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
--- a/Show More
+++ b/Show More