chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes (#24653 )

* chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes * update erroneous case in PEG parser test
mtmd : add post-decode callback (#24645 )
2026-06-16 10:46:43 +02:00 · 2026-06-15 15:27:47 +02:00 · 2026-06-15 16:02:05 +03:00 · 2026-06-15 13:19:21 +02:00 · 2026-06-15 10:11:59 +03:00 · 2026-06-15 10:10:53 +03:00
483 changed files with 34260 additions and 12429 deletions
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

@@ -37,7 +37,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -53,7 +53,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl ffmpeg \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -1,10 +1,11 @@
 ARG UBUNTU_VERSION=24.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=12.8.1
+ARG GCC_VERSION=14
 # Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -12,13 +13,14 @@ ARG APP_REVISION=N/A

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

+ARG GCC_VERSION
 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
+    apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1

-ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
+ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION}

 WORKDIR /app

@@ -59,7 +61,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl ffmpeg \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -5,7 +5,7 @@ ARG APP_REVISION=N/A

 ## Build Image

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 ARG LEVEL_ZERO_VERSION=1.28.2
@@ -42,7 +42,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -57,11 +57,21 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.url=$IMAGE_URL \
      org.opencontainers.image.source=$IMAGE_SOURCE

-ARG IGC_VERSION=v2.20.5
-ARG IGC_VERSION_FULL=2_2.20.5+19972
-ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
-ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
-ARG IGDGMM_VERSION=22.8.2
+#Following versions are for multiple GPUs, since 26.x has known issue:
+#   https://github.com/ggml-org/llama.cpp/issues/21747,
+#   https://github.com/intel/compute-runtime/issues/921.
+#ARG IGC_VERSION=v2.20.5
+#ARG IGC_VERSION_FULL=2_2.20.5+19972
+#ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
+#ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
+#ARG IGDGMM_VERSION=22.8.2
+
+
+ARG IGC_VERSION=v2.34.4
+ARG IGC_VERSION_FULL=2_2.34.4+21428
+ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
+ARG IGDGMM_VERSION=22.10.0
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
@@ -75,7 +85,7 @@ RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && dpkg --install *.deb

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl ffmpeg \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ascendai/cann:$ASCEND_VERSION AS build
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \
    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
-FROM ascendai/cann:$ASCEND_VERSION AS runtime
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -2,9 +2,9 @@ ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -64,7 +64,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl ffmpeg \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -23,7 +23,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ## Build Image
-FROM ubuntu:${UBUNTU_VERSION} AS build
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

 # Pass proxy args to build stage
 ARG http_proxy
@@ -88,7 +88,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base Runtime Image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 # Pass proxy args to runtime stage
 ARG http_proxy
@@ -107,7 +107,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
+    && apt-get install -y libgomp1 libtbb12 curl wget ffmpeg ocl-icd-libopencl1 \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -5,7 +5,7 @@ ARG ROCM_VERSION=7.2.1
 ARG AMDGPU_VERSION=7.2.1

 # Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -76,7 +76,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl ffmpeg \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -5,7 +5,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
-FROM gcc:${GCC_VERSION} AS build
+FROM docker.io/gcc:${GCC_VERSION} AS build

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
@@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


 ### Base image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -33,7 +33,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -49,7 +49,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
+    && apt-get install -y libgomp1 curl ffmpeg libvulkan1 mesa-vulkan-drivers \
    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
    && apt autoremove -y \
    && apt clean -y \
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
@@ -30,7 +30,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -46,7 +46,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 libnuma1 curl \
+    && apt-get install -y libgomp1 libnuma1 curl ffmpeg \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -12,7 +12,7 @@ SYCL:
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
-Nvidia GPU:
+CUDA:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-cuda.h
@@ -27,8 +27,8 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
+          - { sys: UCRT64,  env: ucrt-x86_64,  compiler: gcc,   build: Release }
+          - { sys: CLANG64, env: clang-x86_64, compiler: clang, build: Release }

    steps:
      - name: Clone
@@ -48,9 +48,7 @@ jobs:
          update: true
          msystem: ${{matrix.sys}}
          install: >-
-            base-devel
-            git
-            mingw-w64-${{matrix.env}}-toolchain
+            mingw-w64-${{matrix.env}}-${{matrix.compiler}}
            mingw-w64-${{matrix.env}}-cmake
            mingw-w64-${{matrix.env}}-openblas

@@ -34,129 +34,108 @@ env:
  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
+  ubuntu-24-sycl:
+    strategy:
+      matrix:
+        build: [fp32, fp16]
+        include:
+          - build: fp32
+            fp16: OFF
+          - build: fp16
+            fp16: ON

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  ubuntu-24-sycl:
-#    strategy:
-#      matrix:
-#        build: [fp32]
-#        include:
-#          - build: fp32
-#            fp16: OFF
-#
-#    runs-on: ubuntu-24.04
-#
-#    env:
-#      ONEAPI_ROOT: /opt/intel/oneapi/
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#      LEVEL_ZERO_VERSION: "1.28.2"
-#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-#
-#    continue-on-error: true
-#
-#    steps:
-#      - uses: actions/checkout@v6
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          cd /tmp
-#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-#
-#      - name: Install Level Zero SDK
-#        shell: bash
-#        run: |
-#          cd /tmp
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-#
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: sycl-ubuntu-24-${{ matrix.build }}
-#          evict-old-files: 1d
-#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-#
-#      - name: Build
-#        id: cmake_build
-#        run: |
-#          source /opt/intel/oneapi/setvars.sh
-#          cmake -B build \
-#            -G "Ninja" \
-#            -DCMAKE_BUILD_TYPE=Release \
-#            -DGGML_SYCL=ON \
-#            -DCMAKE_C_COMPILER=icx \
-#            -DCMAKE_CXX_COMPILER=icpx \
-#            -DLLAMA_OPENSSL=OFF \
-#            -DGGML_NATIVE=OFF \
-#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-#          time cmake --build build --config Release -j $(nproc)
+    runs-on: ubuntu-24.04

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  windows-latest-sycl:
-#    runs-on: windows-2022
-#
-#    defaults:
-#      run:
-#        shell: bash
-#
-#    env:
-#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-#
-#      - name: Install Level Zero SDK
-#        shell: pwsh
-#        run: |
-#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: sycl-windows-latest
-#          variant: ccache
-#          evict-old-files: 1d
-#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-#
-#      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-#
-#      - name: Build
-#        id: cmake_build
-#        run:  examples/sycl/win-build-sycl.bat
+    env:
+      ONEAPI_ROOT: /opt/intel/oneapi/
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: sycl-ubuntu-24-${{ matrix.build }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DLLAMA_OPENSSL=OFF \
+            -DGGML_NATIVE=OFF \
+            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+          time cmake --build build --config Release -j $(nproc)
+
+  windows-latest-sycl:
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: sycl-windows-latest
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat
@@ -35,6 +35,29 @@ env:
  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
+  format:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+
+      - name: Install clang-format 22
+        run: |
+          wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key |
+            sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc > /dev/null
+          sudo add-apt-repository -y \
+            "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-22 main"
+          sudo apt-get update
+          sudo apt-get install -y clang-format-22
+
+      - name: Check formatting
+        run: |
+          find ggml/src/ggml-webgpu \
+            -type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \) \
+            -print0 |
+            xargs -0 clang-format-22 --dry-run --Werror
+
  macos:
    runs-on: macos-latest

@@ -82,8 +82,8 @@ jobs:
            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
@@ -59,8 +59,31 @@ jobs:
            echo "should_release=false" >> $GITHUB_OUTPUT
          fi

+  get-version:
+    runs-on: ubuntu-slim
+    outputs:
+      ui_version: ${{ steps.version.outputs.ui_version }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - id: version
+        run: |
+          # Resolve UI version: BUILD_NUMBER from cmake/build-info.cmake > git hash + epoch > fallback
+          version=""
+          if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then
+            build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+')
+            if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
+              version="b${build_number}"
+            fi
+          fi
+          if [ -z "$version" ]; then
+            version=$(git rev-parse --short HEAD)-$(date +%s)
+          fi
+          echo "ui_version=${version}" >> $GITHUB_OUTPUT
+
  macos-cpu:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -116,6 +139,7 @@ jobs:
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -141,7 +165,7 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -201,6 +225,7 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -227,7 +252,7 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    strategy:
@@ -287,6 +312,7 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -312,7 +338,7 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-latest
@@ -379,6 +405,7 @@ jobs:
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -404,7 +431,7 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-24.04
@@ -476,7 +503,8 @@ jobs:
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
+            -DGGML_OPENVINO=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
          cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: ccache-clear
@@ -504,7 +532,7 @@ jobs:
    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

-    runs-on: windows-2025
+    runs-on: windows-2025-vs2026

    permissions:
      actions: write
@@ -535,12 +563,12 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: release-windows-2025-${{ matrix.arch }}-cpu
+          key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu

      - name: Build
        shell: cmd
        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
+          call "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
            -DLLAMA_BUILD_BORINGSSL=ON ^
@@ -554,12 +582,12 @@ jobs:
      - name: ccache-clear
        uses: ./.github/actions/ccache-clear
        with:
-          key: release-windows-2025-${{ matrix.arch }}-cpu
+          key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu

      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Redist\MSVC\14.51.36231\debug_nonredist\${{ matrix.arch }}\Microsoft.VC145.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
@@ -754,213 +782,209 @@ jobs:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  windows-sycl:
-#
-#    runs-on: windows-2022
-#
-#    defaults:
-#      run:
-#        shell: bash
-#
-#    env:
-#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-#
-#      - name: Install Level Zero SDK
-#        shell: pwsh
-#        run: |
-#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-#
-#      - name: Setup Node.js
-#        uses: actions/setup-node@v6
-#        with:
-#          node-version: "24"
-#          cache: "npm"
-#          cache-dependency-path: "tools/ui/package-lock.json"
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: release-windows-2022-x64-sycl
-#
-#      - name: Build
-#        id: cmake_build
-#        shell: cmd
-#        run: |
-#          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-#          cmake -G "Ninja" -B build ^
-#            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-#            -DCMAKE_BUILD_TYPE=Release ^
-#            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-#            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-#            -DLLAMA_BUILD_BORINGSSL=ON
-#          cmake --build build --target ggml-sycl -j
-#
-#      - name: Build the release package
-#        id: pack_artifacts
-#        run: |
-#          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-#
-#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-#
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-#          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-#          if [ -n "$ZE_LOADER_DLL" ]; then
-#            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-#            cp "$ZE_LOADER_DLL" ./build/bin
-#          else
-#            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-#          fi
-#
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
-#
-#          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-#
-#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
-#
-#          echo "cp oneAPI running time dll files to ./build/bin done"
-#          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
-#
-#      - name: Upload the release package
-#        uses: actions/upload-artifact@v6
-#        with:
-#          path: llama-bin-win-sycl-x64.zip
-#          name: llama-bin-win-sycl-x64.zip
+  windows-sycl:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  ubuntu-24-sycl:
-#
-#    strategy:
-#      matrix:
-#        build: [fp32]
-#        include:
-#          - build: fp32
-#            fp16: OFF
-#
-#    runs-on: ubuntu-24.04
-#
-#    env:
-#      ONEAPI_ROOT: /opt/intel/oneapi/
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#      LEVEL_ZERO_VERSION: "1.28.2"
-#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#        with:
-#          fetch-depth: 0
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          cd /tmp
-#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-#
-#      - name: Install Level Zero SDK
-#        shell: bash
-#        run: |
-#          cd /tmp
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-#
-#      - name: Setup Node.js
-#        uses: actions/setup-node@v6
-#        with:
-#          node-version: "24"
-#          cache: "npm"
-#          cache-dependency-path: "tools/ui/package-lock.json"
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: release-ubuntu-24.04-sycl
-#
-#      - name: Build
-#        id: cmake_build
-#        run: |
-#          source /opt/intel/oneapi/setvars.sh
-#          cmake -B build \
-#            -G "Ninja" \
-#            -DCMAKE_BUILD_TYPE=Release \
-#            -DGGML_SYCL=ON \
-#            -DCMAKE_C_COMPILER=icx \
-#            -DCMAKE_CXX_COMPILER=icpx \
-#            -DLLAMA_OPENSSL=OFF \
-#            -DGGML_NATIVE=OFF \
-#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-#          time cmake --build build --config Release -j $(nproc)
-#
-#      - name: Determine tag name
-#        id: tag
-#        uses: ./.github/actions/get-tag-name
-#
-#      - name: Pack artifacts
-#        id: pack_artifacts
-#        run: |
-#          cp LICENSE ./build/bin/
-#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-#
-#      - name: Upload artifacts
-#        uses: actions/upload-artifact@v6
-#        with:
-#          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-#          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-x64-sycl
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+          cmake -G "Ninja" -B build ^
+            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+            -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-sycl
+
+      - name: Build the release package
+        id: pack_artifacts
+        run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+          if [ -n "$ZE_LOADER_DLL" ]; then
+            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+            cp "$ZE_LOADER_DLL" ./build/bin
+          else
+            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+          fi
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
+
+          echo "cp oneAPI running time dll files to ./build/bin done"
+          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+
+      - name: Upload the release package
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-bin-win-sycl-x64.zip
+          name: llama-bin-win-sycl-x64.zip
+
+  ubuntu-24-sycl:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
+
+    strategy:
+      matrix:
+        build: [fp32, fp16]
+        include:
+          - build: fp32
+            fp16: OFF
+          - build: fp16
+            fp16: ON
+
+    runs-on: ubuntu-24.04
+
+    env:
+      ONEAPI_ROOT: /opt/intel/oneapi/
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-ubuntu-24.04-sycl-${{ matrix.build }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DLLAMA_OPENSSL=OFF \
+            -DGGML_NATIVE=OFF \
+            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-ubuntu-24.04-sycl-${{ matrix.build }}
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-22.04
@@ -1052,6 +1076,7 @@ jobs:
            -DGGML_HIP=ON \
            -DHIP_PLATFORM=amd \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -1080,7 +1105,7 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: windows-2022
@@ -1176,6 +1201,7 @@ jobs:
            -DGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
            -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
@@ -1203,7 +1229,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode:
-    needs: [check-release]
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    runs-on: macos-26

@@ -1232,7 +1258,8 @@ jobs:
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

      - name: xcodebuild for swift package
@@ -1352,10 +1379,12 @@ jobs:
 #          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz

-  ui:
-    needs: [check-release]
+  ui-build:
+    needs: [check-release, get-version]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    uses: ./.github/workflows/ui-build.yml
+    with:
+      hf_ui_version: ${{ needs.get-version.outputs.ui_version }}

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -1368,6 +1397,7 @@ jobs:
    runs-on: ubuntu-slim

    needs:
+      - get-version
      - windows
      - windows-cpu
      - windows-cuda
@@ -1382,7 +1412,7 @@ jobs:
      - macos-cpu
      - ios-xcode
      #- openEuler-cann
-      - ui
+      - ui-build

    outputs:
      tag_name: ${{ steps.tag.outputs.name }}
@@ -1482,7 +1512,8 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
+            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
+            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -1493,7 +1524,7 @@ jobs:
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
-            - Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
+            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

            **openEuler:**
@@ -28,13 +28,6 @@ jobs:
        run: npm run build
        working-directory: tools/ui

-      - name: Generate checksums
-        run: |
-          cd tools/ui/dist
-          for f in *; do
-            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
-          done
-
      - name: Upload built UI
        uses: actions/upload-artifact@v6
        with:
@@ -2,6 +2,11 @@ name: UI Build

 on:
  workflow_call:
+    inputs:
+      hf_ui_version:
+        description: 'Version string for version.json (e.g. 12345)'
+        required: false
+        type: string

 jobs:
  build:
@@ -25,15 +30,15 @@ jobs:
        working-directory: tools/ui

      - name: Build application
+        env:
+          HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
+          LLAMA_BUILD_NUMBER: ${{ inputs.hf_ui_version || 'b0000' }}
        run: npm run build
        working-directory: tools/ui

-      - name: Generate checksums
-        run: |
-          cd tools/ui/dist
-          for f in *; do
-            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
-          done
+      - name: Run PWA unit tests (versioned build output)
+        run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
+        working-directory: tools/ui

      - name: Upload built UI
        uses: actions/upload-artifact@v6
@@ -40,6 +40,12 @@ jobs:
          name: ui-build
          path: tools/ui/dist/

+      - name: Create distribution archive
+        run: |
+          tar -czf dist.tar.gz -C tools/ui/dist .
+          sha256sum dist.tar.gz > dist.tar.gz.sha256
+          mv dist.tar.gz dist.tar.gz.sha256 tools/ui/dist/
+
      - name: Install Hugging Face Hub CLI
        run: pip install -U huggingface_hub

@@ -1,8 +1,8 @@
 name: UI (self-hosted)

 # these are the same as ui.yml, but with self-hosted runners
-# the runners come with pre-installed Playwright browsers version: 1.56.1
-# the jobs are much lighter because they don't need to install node and playwright browsers
+# the jobs are lighter because they don't need to install Node.js or Playwright browsers
+# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/

 on:
  workflow_dispatch:
@@ -61,6 +61,12 @@ jobs:
        run: npm ci
        working-directory: tools/ui

+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
@@ -72,12 +78,12 @@ jobs:
        working-directory: tools/ui

      - name: Run Client tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:client
        working-directory: tools/ui

      - name: Run Unit tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui

@@ -97,22 +103,23 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/

      - name: Build Storybook
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build-storybook
        working-directory: tools/ui

      - name: Run UI tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

      - name: Run E2E tests
-        if: ${{ always() }}
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -43,7 +43,7 @@ jobs:
  ui-checks:
    name: Checks
    needs: ui-build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -60,6 +60,12 @@ jobs:
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

+      - name: Download built UI artifacts
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
@@ -87,7 +93,7 @@ jobs:
        run: npm run test:client
        working-directory: tools/ui

-      - name: Run Unit tests
+      - name: Run Unit tests (uses pre-built dist/ from ui-build)
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui
@@ -95,7 +101,7 @@ jobs:
  e2e-tests:
    name: E2E Tests
    needs: ui-build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
@@ -117,10 +123,11 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
+      - name: Download built UI artifacts (reuses ui-build)
+        uses: actions/download-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/

      - name: Install Playwright browsers
        id: playwright
@@ -138,7 +145,7 @@ jobs:
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

-      - name: Run E2E tests
+      - name: Run E2E tests (uses pre-built dist/ from ui-build)
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -17,7 +17,7 @@ jobs:

      - name: Install komac
        run: |
-          cargo binstall komac@2.15.0 -y
+          cargo binstall komac@2.16.0 -y

      - name: Find latest release
        id: find_latest_release
@@ -92,13 +92,6 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

-# Server Web UI temporary files (+ legacy directory)
-
-/tools/server/webui/node_modules
-/tools/server/webui/dist
-/tools/ui/node_modules
-/tools/ui/dist
-
 # Python

 /.venv
@@ -16,12 +16,12 @@ Pull requests (PRs):
 - New branch names are prefixed with "gg/"
 - Before opening a pull request, ask the user to confirm the description
 - When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
+- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
 - Ask the user to tell you what model was used and write it in place of [MODEL]
 - Always create the pull requests in draft mode

 Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
+- On every commit that you make, include a "Assisted-by: pi:llama.cpp/[MODEL]" tag
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
+![llama](https://raw.githubusercontent.com/ggml-org/llama.brand/refs/heads/master/cover/llama-cpp/cover-llama-cpp-dark.svg)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
@@ -78,6 +78,8 @@ add_library(${TARGET}
    hf-cache.cpp
    hf-cache.h
    http.h
+    imatrix-loader.cpp
+    imatrix-loader.h
    json-partial.cpp
    json-partial.h
    json-schema-to-grammar.cpp
@@ -444,7 +444,13 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
    opts.offline         = params.offline;
    opts.skip_download   = params.skip_download;
    opts.download_mtp    = spec_type_draft_mtp;
-    opts.download_mmproj = !params.no_mmproj;
+    opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
+
+    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
+    // so we should not auto-discover mtp/mmproj siblings for them
+    common_download_opts sub_opts = opts;
+    sub_opts.download_mtp    = false;
+    sub_opts.download_mmproj = false;

    try {
        auto res = common_params_handle_model(params.model, opts);
@@ -457,7 +463,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
        // only download mmproj if the current example is using it
        for (const auto & ex : mmproj_examples) {
            if (curr_ex == ex) {
-                common_params_handle_model(params.mmproj, opts);
+                common_params_handle_model(params.mmproj, sub_opts);
                break;
            }
        }
@@ -470,8 +476,8 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
            params.speculative.draft.mparams.url.empty()) {
            params.speculative.draft.mparams.path = res.mtp.path;
        }
-        common_params_handle_model(params.speculative.draft.mparams, opts);
-        common_params_handle_model(params.vocoder.model,             opts);
+        common_params_handle_model(params.speculative.draft.mparams, sub_opts);
+        common_params_handle_model(params.vocoder.model,             sub_opts);
        return true;
    } catch (const common_skip_download_exception &) {
        return false;
@@ -1354,7 +1360,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--cache-idle-slots"},
        {"--no-cache-idle-slots"},
-        "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
+        "save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)",
        [](common_params & params, bool value) {
            params.cache_idle_slots = value;
        }
@@ -1609,7 +1615,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
        [](common_params & params, const std::string & value) {
            const auto sampler_names = string_split<std::string>(value, ';');
-            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
        }
    ).set_sampling());
@@ -2215,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
-        {"--image", "--audio"}, "FILE",
-        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
+        {"--image", "--audio", "--video"}, "FILE",
+        "path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.image.emplace_back(item);
@@ -2237,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.image_max_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--mtmd-batch-max-tokens"}, "N",
+        string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
+        [](common_params & params, int value) {
+            params.mtmd_batch_max_tokens = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
@@ -3327,6 +3340,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            common_log_set_file(common_log_main(), value.c_str());
        }
    ).set_env("LLAMA_ARG_LOG_FILE"));
+    add_opt(common_arg(
+        {"--log-prompts-dir"}, "PATH",
+        "Log prompts to directory (only used for debugging, default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.path_prompts_log_dir = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"--log-colors"}, "[on|off|auto]",
        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -134,7 +134,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons
            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
            parser = ctx.reasoning_parser + p.space() + p.choice({
                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
-                response_format
+                p.space() + response_format  + p.space()
            }) + p.end();
            pure_content = false;
        } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
@@ -393,8 +393,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                           (schema_info.resolves_to_string(param_schema) ?
                                p.tool_arg_string_value(until_suffix) :
                                p.tool_arg_json_value(p.schema(
-                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                    p.space()) +
+                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false))) +
                           p.tool_arg_close(p.literal(arguments.value_suffix)));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
@@ -1229,8 +1229,8 @@ void analyze_tools::extract_argument_name_markers() {
            left_result.tags["pre"] == right_result.tags["pre"] &&
            left_result.tags["suffix"] == right_result.tags["suffix"]) {
            // Name is inside a structure (e.g., JSON key): prefix is the shared wrapper
-            arguments.name_prefix = trim_whitespace(left_result.tags["pre"]);
-            arguments.name_suffix = trim_leading_whitespace(left_result.tags["suffix"]);
+            arguments.name_prefix = left_result.tags["pre"];
+            arguments.name_suffix = left_result.tags["suffix"];
        } else if (diff.left.substr(0, ARG_FIRST.length()) == ARG_FIRST && diff.right.substr(0, ARG_SECOND.length()) == ARG_SECOND) {
            // Name is directly in the diff: prefix comes from last marker in diff.prefix
            auto pre_parser = build_tagged_peg_parser([&](common_peg_parser_builder & p) {
@@ -1315,8 +1315,7 @@ void analyze_tools::extract_argument_value_markers() {
                value_suffix = value_suffix.substr(0, end_marker_pos);
            }
        }
-        value_suffix = trim_leading_whitespace(value_suffix);
-        if (!value_suffix.empty()) {
+        if (!trim_whitespace(value_suffix).empty()) {
            arguments.value_suffix = value_suffix;
        }
    }
@@ -87,6 +87,8 @@ static std::string normalize_quotes_to_json(const std::string & input) {
    bool in_single_quoted = false;
    bool in_double_quoted = false;

+    auto is_word_char = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; };
+
    for (size_t i = 0; i < input.size(); ++i) {
        char c = input[i];

@@ -151,6 +153,29 @@ static std::string normalize_quotes_to_json(const std::string & input) {
                in_single_quoted = true;
                result += '"';
            }
+        } else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') &&
+                   (i == 0 || !is_word_char(input[i - 1]))) {
+            // Python literals -> JSON; prefix match keeps streamed partials monotonic.
+            static constexpr std::pair<std::string_view, std::string_view> literals[] = {
+                { "True", "true" }, { "False", "false" }, { "None", "null" },
+            };
+            size_t n = 0;
+            while (i + n < input.size() && is_word_char(input[i + n])) {
+                ++n;
+            }
+            std::string_view token(input.data() + i, n);
+            bool matched = false;
+            for (const auto & [py, js] : literals) {
+                if (py.substr(0, n) == token) {
+                    result += js.substr(0, n);
+                    i += n - 1;
+                    matched = true;
+                    break;
+                }
+            }
+            if (!matched) {
+                result += c;
+            }
        } else {
            result += c;
        }
@@ -338,7 +363,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
    }

    if ((is_arg_value || is_arg_string_value) && current_tool) {
-        std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));
+        std::string value_content = std::string(node.text);

        std::string value_to_add;
        if (value_content.empty() && is_arg_string_value) {
@@ -353,12 +378,8 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
            }
            value_to_add += escape_json_string_inner(value_content);
        } else if (!value_content.empty()) {
-            // For potential containers, normalize Python-style single quotes to JSON double quotes
-            bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
-            if (is_potential_container) {
-                value_content = normalize_container_value(value_content);
-            }
-            value_to_add += value_content;
+            // Pythonic scalars/containers -> JSON.
+            value_to_add += normalize_container_value(value_content);
        }

        args_target() += value_to_add;
@@ -466,11 +487,34 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
    return force_tool_calls ? section : optional(section);
 }

+// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5
+common_peg_parser common_chat_peg_builder::python_or_json_value() {
+    return rule("python-or-json-value", [this]() {
+        auto ws    = space();
+        auto value = python_or_json_value();
+
+        auto member  = sequence({ python_string(), ws, literal(":"), ws, value });
+        auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
+        auto dict    = rule("python-or-json-dict", [&]() {
+            return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws });
+        });
+
+        auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) });
+        auto array    = rule("python-or-json-array", [&]() {
+            return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws });
+        });
+
+        return choice({ dict, array, python_string(), python_number(),
+                        python_bool(), python_null(), json_bool(), json_null() });
+    });
+}
+
 // Python-style tool calls: name(arg1="value1", arg2=123)
 // Used only by LFM2 for now, so we don't merge it into autoparser
 common_peg_parser common_chat_peg_builder::python_style_tool_calls(
    const ordered_json & tools,
-    bool                 parallel_tool_calls) {
+    bool                 parallel_tool_calls,
+    bool                 allow_json_literals) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
@@ -504,7 +548,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
                if (is_string_type) {
                    arg_value_parser = string_value_parser;
                } else {
-                    arg_value_parser = tool_arg_value(python_value());
+                    arg_value_parser = tool_arg_value(allow_json_literals ? python_or_json_value() : python_value());
                }

                // Full argument: name="value" or name=value
@@ -132,9 +132,13 @@ class common_chat_peg_builder : public common_peg_parser_builder {
    // Helper for Python-style function call format: name(arg1="value1", arg2=123)
    // Used by LFM2 and similar templates
    common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
-                                              bool                           parallel_tool_calls);
+                                              bool                           parallel_tool_calls,
+                                              bool                           allow_json_literals);

  private:
+    // Python values plus JSON true/false/null.
+    common_peg_parser python_or_json_value();
+
    // Implementation helpers for standard_json_tools — one per JSON tool call layout mode
    common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
                                                       const std::string &            args_key,
@@ -195,4 +199,3 @@ struct tagged_peg_parser {

 tagged_peg_parser build_tagged_peg_parser(
    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
-
@@ -1608,42 +1608,52 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
    return data;
 }

-// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
-// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
-// - Reasoning: <think>{reasoning}</think> (optional)
-// - Content: text before a tool call (optional)
-// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
-//   Tool calls can appear multiple times (parallel tool calls supported)
-static common_chat_params common_chat_params_init_lfm2(const common_chat_template &    tmpl,
-                                                       const autoparser::generation_params & inputs) {
+// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable
+// (except dotted names and JSON literals true/false/null).
+// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional <think> reasoning.
+// tool_list_tokens preserves LFM2 system tool-list markers.
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template &          tmpl,
+                                                       const autoparser::generation_params & inputs,
+                                                       bool tool_list_tokens) {
    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
-    data.preserved_tokens  = {
-        "<|tool_list_start|>",
-        "<|tool_list_end|>",
-        "<|tool_call_start|>",
-        "<|tool_call_end|>",
-        "<think>",
-        "</think>",
-    };
-
-    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
-
    const std::string TOOL_CALL_START = "<|tool_call_start|>";
    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
+    const std::string TOOL_LIST_START = "<|tool_list_start|>";
+    const std::string TOOL_LIST_END   = "<|tool_list_end|>";
    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

+    // Copy reasoning to the "thinking" field the template expects
+    auto adjusted_messages = json::array();
+    for (auto msg : inputs.messages) {
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            msg["thinking"] = msg.at("reasoning_content");
+        }
+        adjusted_messages.push_back(msg);
+    }
+
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking = true;
+    data.preserved_tokens  = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
+    if (tool_list_tokens) {
+        data.preserved_tokens.push_back(TOOL_LIST_START);
+        data.preserved_tokens.push_back(TOOL_LIST_END);
+    }
+
    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

+    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
+    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
+    // Gate by reasoning format and whether the template supports <think>
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
+                             tmpl.source().find(THINK_START) != std::string::npos;
+    auto include_grammar   = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
+
    if (inputs.has_continuation()) {
        const auto & msg = inputs.continue_msg;

@@ -1660,17 +1670,21 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
        auto end = p.end();

        auto reasoning = p.eps();
-        if (extract_reasoning && inputs.enable_thinking) {
+        if (extract_reasoning) {
            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
        }

        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
+            if (has_response_format) {
+                auto response_format = p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema));
+                return generation_prompt + reasoning + response_format + end;
+            }
            return generation_prompt + reasoning + p.content(p.rest()) + end;
        }
        auto tool_calls = p.rule("tool-calls",
            p.trigger_rule("tool-call",
                p.literal(TOOL_CALL_START) +
-                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
+                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) +
                p.literal(TOOL_CALL_END)
            )
        );
@@ -1683,13 +1697,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    data.parser = parser.save();

    if (include_grammar) {
-        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+        data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                auto         schema   = function.at("parameters");
                builder.resolve_refs(schema);
            });
+            if (has_response_format) {
+                auto schema = inputs.json_schema;
+                builder.resolve_refs(schema);
+            }
            parser.build_grammar(builder, data.grammar_lazy);
        });

@@ -1697,93 +1715,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
        };
    }
-    return data;
-}
-
-// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
-// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
-// - Reasoning: <think>{reasoning}</think> (optional)
-// - Content: text before a tool call (optional)
-// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
-//   Tool calls can appear multiple times (parallel tool calls supported)
-static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template &    tmpl,
-                                                         const autoparser::generation_params & inputs) {
-    common_chat_params data;
-
-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
-    data.preserved_tokens  = {
-        "<|tool_call_start|>",
-        "<|tool_call_end|>",
-        "<think>",
-        "</think>",
-    };
-
-    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
-
-    const std::string THINK_START     = "<think>";
-    const std::string THINK_END       = "</think>";
-    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";
-
-    data.thinking_start_tag = THINK_START;
-    data.thinking_end_tag   = THINK_END;
-
-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
-    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal(GEN_PROMPT);
-        auto end = p.end();
-
-        auto reasoning = p.eps();
-        if (extract_reasoning && inputs.enable_thinking) {
-            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
-        }
-
-        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-            return generation_prompt + reasoning + p.content(p.rest()) + end;
-        }
-
-        auto tool_calls = p.rule("tool-calls",
-            p.trigger_rule("tool-call",
-                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
-            )
-        );
-
-        auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
-        auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
-        return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
-    });
-
-    data.parser = parser.save();
-
-    if (include_grammar) {
-        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto         schema   = function.at("parameters");
-                builder.resolve_refs(schema);
-            });
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-        foreach_function(inputs.tools, [&](const json & tool) {
-            const std::string name = tool.at("function").at("name");
-            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
-        });
-    }

    return data;
 }
@@ -2048,6 +1979,146 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    return data;
 }

+// Cohere2 MoE (a.k.a. "North Code") parser.
+//
+// The assistant turn is fully marker-wrapped:
+//   <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+//     <|START_THINKING|>{reasoning}<|END_THINKING|>
+//     then EITHER content:    <|START_TEXT|>{content}<|END_TEXT|>
+//          OR     tool calls: <|START_ACTION|>[
+//                                 {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ...
+//                             ]<|END_ACTION|>
+//   <|END_OF_TURN_TOKEN|>
+//
+// The generation prompt forces a leading <|START_THINKING|> (when reasoning is enabled, which is
+// the template default), so the model's output continues from *inside* the thinking block. The
+// parser literal therefore only covers the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix
+// and the reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers itself,
+// regardless of whether they came from the generation prompt or the generated text.
+static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template &          tmpl,
+                                                              const autoparser::generation_params & inputs) {
+    common_chat_params data;
+
+    const std::string TURN_START    = "<|START_OF_TURN_TOKEN|>";
+    const std::string TURN_END      = "<|END_OF_TURN_TOKEN|>";
+    const std::string CHATBOT       = "<|CHATBOT_TOKEN|>";
+    const std::string USER          = "<|USER_TOKEN|>";
+    const std::string SYSTEM        = "<|SYSTEM_TOKEN|>";
+    const std::string THINK_START   = "<|START_THINKING|>";
+    const std::string THINK_END     = "<|END_THINKING|>";
+    const std::string TEXT_START    = "<|START_TEXT|>";
+    const std::string TEXT_END      = "<|END_TEXT|>";
+    const std::string ACTION_START  = "<|START_ACTION|>";
+    const std::string ACTION_END    = "<|END_ACTION|>";
+    const std::string RESULT_START  = "<|START_TOOL_RESULT|>";
+    const std::string RESULT_END    = "<|END_TOOL_RESULT|>";
+
+    // Stable prefix of the generation prompt that precedes the (forced) <|START_THINKING|> marker.
+    const std::string GEN_PREFIX = TURN_START + CHATBOT;
+
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking  = true;
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+    data.preserved_tokens   = {
+        TURN_START, TURN_END, CHATBOT, USER, SYSTEM,
+        THINK_START, THINK_END,
+        TEXT_START, TEXT_END,
+        ACTION_START, ACTION_END,
+        RESULT_START, RESULT_END,
+    };
+
+    // Split the rendered prompt into per-role message spans. Tool results are rendered with the
+    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
+    // the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "assistant", GEN_PREFIX },
+        { "user",      TURN_START + USER },
+        { "tool",      TURN_START + SYSTEM + RESULT_START },
+        { "system",    TURN_START + SYSTEM },
+    });
+
+    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
+
+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PREFIX + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + TEXT_START + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto generation_prompt = p.literal(GEN_PREFIX);
+        auto end               = p.end();
+
+        // The thinking block is always present (the generation prompt forces <|START_THINKING|>).
+        // When extracting reasoning, capture its body; otherwise keep the whole block (markers
+        // included) inline as content, matching reasoning_format=NONE conventions.
+        common_peg_parser reasoning = p.eps();
+        if (extract_reasoning) {
+            reasoning = p.optional(p.literal(THINK_START) +
+                                   p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
+                                   p.optional(p.literal(THINK_END)));
+        } else {
+            reasoning = p.optional(p.content(p.literal(THINK_START) +
+                                             p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
+                                             p.optional(p.literal(THINK_END))));
+        }
+
+        auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
+
+        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
+            return generation_prompt + reasoning + text_content + p.optional(p.literal(TURN_END)) + end;
+        }
+
+        auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+        // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|>
+        auto tool_calls = p.standard_json_tools(ACTION_START, ACTION_END, inputs.tools, inputs.parallel_tool_calls,
+                                                /* force_tool_calls = */ true,
+                                                /* name_key         = */ "tool_name",
+                                                /* args_key         = */ "parameters",
+                                                /* array_wrapped    = */ true,
+                                                /* function_is_key  = */ false,
+                                                /* call_id_key      = */ "",
+                                                /* gen_call_id_key  = */ "tool_call_id",
+                                                /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" });
+
+        // Content and tool calls are mutually exclusive in this format.
+        common_peg_parser body = require_tools ? tool_calls : p.choice({ tool_calls, text_content });
+
+        return generation_prompt + reasoning + body + p.optional(p.literal(TURN_END)) + end;
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto         schema   = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ACTION_START }
+        };
+    }
+
+    return data;
+}
+
 namespace workaround {

 static void map_developer_role_to_system(json & messages) {
@@ -2296,16 +2367,25 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

+    // Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and
+    // <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older
+    // Command-R templates use <|START_RESPONSE|>).
+    if (src.find("<|START_TEXT|>") != std::string::npos &&
+        src.find("<|START_ACTION|>") != std::string::npos) {
+        LOG_DBG("Using specialized template: Cohere2 MoE\n");
+        return common_chat_params_init_cohere2moe(tmpl, params);
+    }
+
    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
-        return common_chat_params_init_lfm2(tmpl, params);
+        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
    }

    // LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
    if (src.find("List of tools: [") != std::string::npos &&
        src.find("<|tool_list_start|>") == std::string::npos) {
        LOG_DBG("Using specialized template: LFM2.5\n");
-        return common_chat_params_init_lfm2_5(tmpl, params);
+        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false);
    }

    // GigaChatV3 format detection
@@ -1148,7 +1148,7 @@ static void common_init_sampler_from_model(
        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
            if (!sampler_names.empty()) {
-                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+                sparams.samplers = common_sampler_types_from_names(sampler_names);
            }
        }
    }
@@ -489,6 +489,7 @@ struct common_params {
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
+    std::string path_prompts_log_dir = ""; // directory with logged prompts                                 // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files                     // NOLINT
@@ -571,9 +572,10 @@ struct common_params {
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
-    std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
    int image_min_tokens = -1;
    int image_max_tokens = -1;
+    int mtmd_batch_max_tokens = 1024;

    // finetune
    struct lr_opt lr;
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
    using std::runtime_error::runtime_error;
 };

-std::vector<llama_device_memory_data> common_get_device_memory_data(
+static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
        const char * path_model,
        const llama_model_params * mparams,
        const llama_context_params * cparams,
@@ -150,6 +150,29 @@ std::vector<llama_device_memory_data> common_get_device_memory_data(
    return ret;
 }

+common_device_memory_data_vec common_get_device_memory_data(
+        const char * path_model,
+        const llama_model_params * mparams,
+        const llama_context_params * cparams,
+        std::vector<ggml_backend_dev_t> & devs,
+        uint32_t & hp_ngl,
+        uint32_t & hp_n_ctx_train,
+        uint32_t & hp_n_expert,
+        ggml_log_level log_level) {
+    std::vector<llama_device_memory_data> impl = common_get_device_memory_data_impl(
+            path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level);
+
+    common_device_memory_data_vec ret(impl.size());
+    for (size_t i = 0; i < impl.size(); i++) {
+        ret[i].total   = impl[i].total;
+        ret[i].free    = impl[i].free;
+        ret[i].model   = impl[i].mb.model;
+        ret[i].context = impl[i].mb.context;
+        ret[i].compute = impl[i].mb.compute;
+    }
+    return ret;
+}
+
 static void common_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -169,7 +192,7 @@ static void common_params_fit_impl(
    // step 1: get data for default parameters and check whether any changes are necessary in the first place

    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
-    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+    const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices

    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
@@ -304,7 +327,7 @@ static void common_params_fit_impl(

                    int64_t sum_projected_used_min_ctx = 0;
                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+                    const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
                    if (nd == 0) {
                        sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
                    } else {
@@ -482,7 +505,7 @@ static void common_params_fit_impl(
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

-        const dmds_t dmd_nl = common_get_device_memory_data(
+        const dmds_t dmd_nl = common_get_device_memory_data_impl(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
@@ -510,7 +533,7 @@ static void common_params_fit_impl(
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
-        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
+        const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (size_t id = 0; id < nd; id++) {
@@ -940,7 +963,7 @@ void common_fit_print(
    uint32_t hp_nct = 0; // hparams.n_ctx_train
    uint32_t hp_nex = 0; // hparams.n_expert

-    auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
+    auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
    GGML_ASSERT(dmd.size() == devs.size() + 1);

    for (size_t id = 0; id < devs.size(); id++) {
@@ -1,9 +1,7 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-backend.h"
 #include "llama.h"
-#include "../src/llama-ext.h"

 #include <vector>

@@ -18,31 +16,41 @@ enum common_params_fit_status {
 //   - this function is NOT thread safe because it modifies the global llama logger state
 //   - only parameters that have the same value as in llama_default_model_params are modified
 //     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
+common_params_fit_status common_fit_params(
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams,
+                              float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+   llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+                             size_t * margins,               // margins of memory to leave per device in bytes
+                           uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
+                     ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

 // print estimated memory to stdout
 void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams);

-void common_memory_breakdown_print(const struct llama_context * ctx);
+void common_memory_breakdown_print(const llama_context * ctx);
+
+struct common_device_memory_data {
+    int64_t total;
+    int64_t free;
+    size_t  model;
+    size_t  context;
+    size_t  compute;
+};
+
+using common_device_memory_data_vec = std::vector<common_device_memory_data>;

 // Load a model + context with no_alloc and return the per-device memory breakdown.
-std::vector<llama_device_memory_data> common_get_device_memory_data(
-                                  const char   * path_model,
-        const struct llama_model_params         * mparams,
-        const struct llama_context_params       * cparams,
-        std::vector<ggml_backend_dev_t>         & devs,
-                                      uint32_t  & hp_ngl,
-                                      uint32_t  & hp_n_ctx_train,
-                                      uint32_t  & hp_n_expert,
-                           enum ggml_log_level    log_level);
+common_device_memory_data_vec common_get_device_memory_data(
+                         const char * path_model,
+           const llama_model_params * mparams,
+         const llama_context_params * cparams,
+    std::vector<ggml_backend_dev_t> & devs,
+                           uint32_t & hp_ngl,
+                           uint32_t & hp_n_ctx_train,
+                           uint32_t & hp_n_expert,
+                     ggml_log_level   log_level);
@@ -0,0 +1,165 @@
+#include "imatrix-loader.h"
+#include "common.h"
+#include "log.h"
+#include "gguf.h"
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+
+static bool common_imatrix_load_legacy(const std::string & fname, common_imatrix & imatrix) {
+    std::ifstream in(fname, std::ios::binary);
+    if (!in) {
+        LOG_ERR("%s: failed to open %s\n", __func__, fname.c_str());
+        return false;
+    }
+
+    int n_entries;
+    in.read((char *) &n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        LOG_ERR("%s: no data in file %s\n", __func__, fname.c_str());
+        return false;
+    }
+
+    for (int i = 0; i < n_entries; ++i) {
+        int32_t len = 0;
+        in.read((char *) &len, sizeof(len));
+        std::vector<char> name_as_vec(len + 1);
+        in.read((char *) name_as_vec.data(), len);
+        if (in.fail()) {
+            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname.c_str());
+            return false;
+        }
+        name_as_vec[len] = 0;
+        std::string name{ name_as_vec.data() };
+
+        int32_t ncall = 0;
+        in.read((char *) &ncall, sizeof(ncall));
+        int32_t nval = 0;
+        in.read((char *) &nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
+            return false;
+        }
+
+        auto & e = imatrix.entries[std::move(name)];
+        e.sums.resize(nval);
+        in.read((char *) e.sums.data(), nval * sizeof(float));
+        if (in.fail()) {
+            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
+            return false;
+        }
+
+        e.counts.resize(1);
+        e.counts[0] = ncall;
+    }
+
+    // the trailing data (chunk count + dataset name) is optional
+    if (in.peek() != EOF) {
+        int32_t n_calls = 0;
+        in.read((char *) &n_calls, sizeof(n_calls));
+        imatrix.chunk_count = n_calls;
+
+        if (!in.fail()) {
+            int32_t len = 0;
+            in.read((char *) &len, sizeof(len));
+            if (!in.fail() && len > 0) {
+                std::vector<char> dataset(len + 1, 0);
+                in.read(dataset.data(), len);
+                if (!in.fail()) {
+                    imatrix.datasets.push_back(dataset.data());
+                }
+            }
+        }
+    }
+
+    imatrix.chunk_size = 0;
+    imatrix.is_legacy  = true;
+
+    return true;
+}
+
+bool common_imatrix_load(const std::string & fname, common_imatrix & imatrix) {
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        return common_imatrix_load_legacy(fname, imatrix);
+    }
+
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 1) {
+        LOG_ERR("%s: no data in file %s\n", __func__, fname.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
+        return false;
+    }
+
+    const int64_t datasets_key   = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
+    const int64_t chunk_count_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
+    const int64_t chunk_size_key  = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
+
+    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
+        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
+        imatrix.datasets.reserve(imatrix.datasets.size() + n);
+        for (int64_t i = 0; i < n; ++i) {
+            imatrix.datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
+        }
+    }
+
+    imatrix.has_metadata = (datasets_key != -1 && chunk_count_key != -1 && chunk_size_key != -1);
+    imatrix.chunk_count  = (chunk_count_key != -1) ? gguf_get_val_u32(ctx_gguf, chunk_count_key) : 0;
+    imatrix.chunk_size   = (chunk_size_key  != -1) ? gguf_get_val_u32(ctx_gguf, chunk_size_key)  : 0;
+
+    const std::string in_sum2_suffix{ ".in_sum2" };
+    const std::string counts_suffix{ ".counts" };
+
+    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string name = cur->name;
+
+        if (name.empty()) { continue; }
+
+        if (string_remove_suffix(name, in_sum2_suffix)) {
+            sums_counts_for[std::move(name)].first = cur;
+        } else if (string_remove_suffix(name, counts_suffix)) {
+            sums_counts_for[std::move(name)].second = cur;
+        }
+    }
+
+    for (const auto & sc : sums_counts_for) {
+        const std::string &        name    = sc.first;
+        const struct ggml_tensor * in_sum2 = sc.second.first;
+        const struct ggml_tensor * counts  = sc.second.second;
+
+        if (!in_sum2 || !counts) {
+            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
+        }
+
+        auto & e = imatrix.entries[name];
+
+        const int64_t nval    = ggml_nelements(in_sum2);
+        const int64_t ncounts = ggml_nelements(counts);
+
+        e.sums.resize(nval);
+        for (int64_t j = 0; j < nval; ++j) {
+            e.sums[j] = ((const float *) in_sum2->data)[j];
+        }
+
+        e.counts.resize(ncounts);
+        for (int64_t j = 0; j < ncounts; ++j) {
+            e.counts[j] = std::lround(((const float *) counts->data)[j]);
+        }
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+    return true;
+}
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+inline constexpr const char * LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
+inline constexpr const char * LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+inline constexpr const char * LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
+struct common_imatrix_entry {
+    std::vector<float>   sums;
+    std::vector<int64_t> counts;
+};
+
+struct common_imatrix {
+    std::map<std::string, common_imatrix_entry> entries;
+    std::vector<std::string> datasets;
+    int32_t chunk_count    = 0;
+    int32_t chunk_size     = 0;
+    bool    is_legacy      = false;
+    bool    has_metadata   = false;
+};
+
+bool common_imatrix_load(const std::string & fname, common_imatrix & imatrix);
@@ -316,12 +316,22 @@ value filter_expression::execute_impl(context & ctx) {

    JJ_DEBUG("Applying filter to %s", input->type().c_str());

+    auto set_filter_alias = [](auto & filter_id) {
+        if (filter_id == "count") {
+            filter_id = "length";
+        } else if (filter_id == "d") {
+            filter_id = "default";
+        } else if (filter_id == "e") {
+            filter_id = "escape";
+        } else if (filter_id == "trim") {
+            filter_id = "strip";
+        }
+    };
+
    if (is_stmt<identifier>(filter)) {
        auto filter_id = cast_stmt<identifier>(filter)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
        // TODO: Refactor filters so this coercion can be done automatically
        if (!input->is_undefined() && !is_val<value_string>(input) && (
@@ -345,9 +355,7 @@ value filter_expression::execute_impl(context & ctx) {
        }
        auto filter_id = cast_stmt<identifier>(call->callee)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str());
        func_args args(ctx);
        for (const auto & arg_expr : call->args) {
@@ -761,9 +769,9 @@ value member_expression::execute_impl(context & ctx) {

        if (is_stmt<slice_expression>(this->property)) {
            auto s = cast_stmt<slice_expression>(this->property);
-            value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val<value_int>(0);
-            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : mk_val<value_int>(arr_size);
            value step_val  = s->step_expr  ? s->step_expr->execute(ctx)  : mk_val<value_int>(1);
+            value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(arr_size - 1) : mk_val<value_int>(0));
+            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : (step_val->as_int() < 0 ? mk_val<value_int>(-1) : mk_val<value_int>(arr_size));

            // translate to function call: obj.slice(start, stop, step)
            JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s",
@@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
            stop_val = std::min(stop_val, len);
        }
    } else {
-        start_val = len - 1;
+        start_val = start;
        if (start_val < 0) {
-            start_val = std::max(len + start_val, (int64_t)-1);
+            start_val = std::max(len + start_val, (int64_t)0);
        } else {
            start_val = std::min(start_val, len - 1);
        }

-        stop_val = -1;
+        stop_val = stop;
        if (stop_val < -1) {
            stop_val = std::max(len + stop_val, (int64_t)-1);
        } else {
@@ -673,6 +673,9 @@ const func_builtins & value_string_t::get_builtins() const {
            std::string str = val_input->as_string().str();
            // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
            std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
+            if (delim.empty()) {
+                throw raised_exception("empty separator");
+            }
            int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
            auto result = mk_val<value_array>();
            size_t pos = 0;
@@ -697,6 +700,9 @@ const func_builtins & value_string_t::get_builtins() const {
            std::string str = val_input->as_string().str();
            // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
            std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
+            if (delim.empty()) {
+                throw raised_exception("empty separator");
+            }
            int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
            auto result = mk_val<value_array>();
            size_t pos = 0;
@@ -722,10 +728,23 @@ const func_builtins & value_string_t::get_builtins() const {
            if (count > 0) {
                throw not_implemented_exception("String replace with count argument not implemented");
            }
-            size_t pos = 0;
-            while ((pos = str.find(old_str, pos)) != std::string::npos) {
-                str.replace(pos, old_str.length(), new_str);
-                pos += new_str.length();
+            if (old_str != new_str) {
+                size_t pos = 0;
+                if (old_str.empty()) {
+                    std::string new_res;
+                    new_res.reserve(str.length() + new_str.length() * (str.length() + 1));
+                    new_res += new_str;
+                    for (const char c : str) {
+                        new_res.push_back(c);
+                        new_res += new_str;
+                    }
+                    str = new_res;
+                } else {
+                    while ((pos = str.find(old_str, pos)) != std::string::npos) {
+                        str.replace(pos, old_str.length(), new_str);
+                        pos += new_str.length();
+                    }
+                }
            }
            auto res = mk_val<value_string>(str);
            res->val_str.mark_input_based_on(args.get_pos(0)->val_str);
@@ -1272,13 +1272,13 @@ common_peg_parser common_peg_parser_builder::string_content(char delimiter) {

 common_peg_parser common_peg_parser_builder::double_quoted_string() {
    return rule("double-quoted-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\""), space()});
+        return sequence({literal("\""), string_content('"'), literal("\"")});
    });
 }

 common_peg_parser common_peg_parser_builder::single_quoted_string() {
    return rule("single-quoted-string", [this]() {
-        return sequence({literal("'"), string_content('\''), literal("'"), space()});
+        return sequence({literal("'"), string_content('\''), literal("'")});
    });
 }

@@ -1301,25 +1301,25 @@ common_peg_parser common_peg_parser_builder::json_number() {
        // At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
        // This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
        auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
-        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
+        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation });
    });
 }

 common_peg_parser common_peg_parser_builder::json_string() {
    return rule("json-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\""), space()});
+        return sequence({literal("\""), string_content('"'), literal("\"")});
    });
 }

 common_peg_parser common_peg_parser_builder::json_bool() {
    return rule("json-bool", [this]() {
-        return sequence({choice({literal("true"), literal("false")}), space()});
+        return choice({literal("true"), literal("false")});
    });
 }

 common_peg_parser common_peg_parser_builder::json_null() {
    return rule("json-null", [this]() {
-        return sequence({literal("null"), space()});
+        return literal("null");
    });
 }

@@ -1334,8 +1334,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
            choice({
                literal("}"),
                sequence({members, ws, literal("}")})
-            }),
-            ws
+            })
        });
    });
 }
@@ -1350,8 +1349,7 @@ common_peg_parser common_peg_parser_builder::json_array() {
            choice({
                literal("]"),
                sequence({elements, ws, literal("]")})
-            }),
-            ws
+            })
        });
    });
 }
@@ -1381,16 +1379,13 @@ common_peg_parser common_peg_parser_builder::python_number() {

 common_peg_parser common_peg_parser_builder::python_bool() {
    return rule("python-bool", [this]() {
-        return sequence({
-            choice({literal("True"), literal("False")}),
-            space()
-        });
+        return choice({literal("True"), literal("False")});
    });
 }

 common_peg_parser common_peg_parser_builder::python_null() {
    return rule("python-none", [this]() {
-        return sequence({literal("None"), space()});
+        return literal("None");
    });
 }

@@ -1512,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
    auto pieces = matcher.collect_prefix_and_next();

    std::string pattern;
+    std::string trailing;  // optional proper-prefix of a delimiter, allowed only at the very end
    for (size_t i = 0; i < pieces.size(); ++i) {
        if (i > 0) {
            pattern += " | ";
@@ -1527,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
        }

        if (!pre.empty()) {
-            pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
+            std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
+            pattern += pre_literal + " [^" + cls + "]";
+            // Each interior alternative consumes a delimiter-prefix plus a disambiguating
+            // char, so the repetition alone cannot match a value that *ends* on a proper
+            // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
+            // "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
+            // values, so without this the grammar would reject input the parser accepts.
+            // Allow the value to terminate on any proper prefix as an optional tail.
+            // This makes the grammar a slight superset of the runtime language (a value
+            // may end on the longest prefix, which greedy first-match would not itself
+            // produce); harmless for constrained generation, which only needs to admit
+            // every runtime-valid string.
+            if (!trailing.empty()) {
+                trailing += " | ";
+            }
+            trailing += pre_literal;
        } else {
            pattern += "[^" + cls + "]";
        }
    }

-    return "(" + pattern + ")*";
+    std::string result = "(" + pattern + ")*";
+    if (!trailing.empty()) {
+        result += " (" + trailing + ")?";
+    }
+    return result;
 }

 static std::unordered_set<std::string> collect_reachable_rules(
@@ -769,54 +769,63 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
    }
 }

-std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
-        { "dry",         COMMON_SAMPLER_TYPE_DRY },
-        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
-        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
-        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
-        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
-        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
-        { "adaptive_p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
-    };
-
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
-        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
-        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
-        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { "adaptive-p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
-    };
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names) {
+    // sampler names can be written multiple ways; generate aliases from canonical names
+    static const auto sampler_name_map = []{
+        // canonical sampler name mapping
+        std::unordered_map<std::string, common_sampler_type> canonical_name_map {
+            { "dry",         COMMON_SAMPLER_TYPE_DRY         },
+            { "top_k",       COMMON_SAMPLER_TYPE_TOP_K       },
+            { "top_p",       COMMON_SAMPLER_TYPE_TOP_P       },
+            { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
+            { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P   },
+            { "min_p",       COMMON_SAMPLER_TYPE_MIN_P       },
+            { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+            { "xtc",         COMMON_SAMPLER_TYPE_XTC         },
+            { "infill",      COMMON_SAMPLER_TYPE_INFILL      },
+            { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES   },
+            { "adaptive_p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P  }
+        };
+        std::unordered_map<std::string, common_sampler_type> alias_name_map;
+        for (const auto & entry : canonical_name_map) {
+            const std::string & canonical = entry.first;
+            if (canonical.find('_') == std::string::npos) {
+                continue;
+            }
+            // kebab-case: "top-k", "min-p", etc.
+            {
+                std::string kebab_case = canonical;
+                std::replace(kebab_case.begin(), kebab_case.end(), '_', '-');
+                alias_name_map.insert({kebab_case, entry.second});
+            }
+            // no dash: "topk", "minp", etc.
+            {
+                std::string no_dash = canonical;
+                no_dash.erase(std::remove(no_dash.begin(), no_dash.end(), '_'), no_dash.end());
+                alias_name_map.insert({no_dash, entry.second});
+            }
+        }
+        // misc. aliases
+        alias_name_map.insert({"nucleus", COMMON_SAMPLER_TYPE_TOP_P});
+        alias_name_map.insert({"temp",    COMMON_SAMPLER_TYPE_TEMPERATURE});
+        alias_name_map.insert({"typ",     COMMON_SAMPLER_TYPE_TYPICAL_P});
+        // include aliases + canonical names in the complete mapping
+        alias_name_map.merge(canonical_name_map);
+        return alias_name_map;
+    }();

    std::vector<common_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
-        auto sampler = sampler_canonical_name_map.find(name);
-        if (sampler != sampler_canonical_name_map.end()) {
+        std::string name_lower = name;
+        std::transform(name_lower.begin(), name_lower.end(), name_lower.begin(), ::tolower);
+        auto sampler = sampler_name_map.find(name_lower);
+        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
            continue;
        }
-        if (allow_alt_names) {
-            sampler = sampler_alt_name_map.find(name);
-            if (sampler != sampler_alt_name_map.end()) {
-                samplers.push_back(sampler->second);
-                continue;
-            }
-        }
-        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name_lower.c_str());
    }

    return samplers;
@@ -109,7 +109,7 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx,
 char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
 std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

-std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);

 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
@@ -3,13 +3,14 @@
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
 #include "log.h"
 #include "ngram-cache.h"
 #include "ngram-map.h"
 #include "ngram-mod.h"
 #include "sampling.h"

+#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
+
 #include <algorithm>
 #include <cassert>
 #include <cstring>
@@ -58,10 +59,10 @@ static bool common_speculative_are_compatible(
    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

-    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);

-    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
+    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
@@ -374,31 +375,437 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
    }
 };

+
+// EAGLE3 speculative decoding state
+//
+// Input of draft decoder: (This is different compared to MTP)
+//   At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P.
+//     - t_{P+1} = token at sequence pos P+1 (the *next* token after P)
+//     - g_P     = encoder output = projection of target's extracted hidden states at P
+//
+// Deferred boundary (MTP doesn't have this issue):
+//   Within a single process() call with n_tokens, we can only write decoder KV for
+//   training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens}
+//   which lies *outside* this batch — it is the token target will sample next or the first token from next ubatch.
+//   So the last training pos of each process() call is *deferred* to whichever next call has
+//   the missing token in hand:
+//     - multi-ubatch prefill: the next process()'s first token completes the pair
+//                              (handled by the per-seq "cross-ubatch bridge")
+//     - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last"
+//                              (target's freshest sample) to complete the pair
+//
+// Per-seq carry-over state:
+//   pending_g_last    [n_embd_dec]  ┐  the deferred boundary's (g, pos). Set by
+//   pending_pos_last  llama_pos     ┘  process() at end of ubatch (= last row);
+//                                       rebased by accept() to first-non-accepted pos.
+//   verify_g          [N × n_embd_dec] snapshot of process()'s encoder output;
+//   verify_pos_first  llama_pos         consumed by accept() to recover the right
+//   verify_g_rows     int32_t           pending_g_last row for any n_accepted value.
+//
+// Performance is overall good but there is waste in verify cycle:
+//   process() runs encoder + decoder on the *full* verify batch including rows for
+//   rejected drafts. The KV at those positions is then dropped.
+//
+// TODO: Not sure if we need optimization for this waste?
+// If so we may need hybrid stash:
+//      in verify mode, have process() only stash features and let draft() seed run
+//      encoder+decoder on n_accepted+1 rows).
 struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
-    //common_params_speculative_eagle3 params;
+    common_params_speculative_draft params;
+    llama_batch batch;
+
+    std::vector<common_sampler_ptr> smpls;
+
+    int32_t n_embd_dec = 0;       // draft hidden size
+    int32_t n_embd_enc = 0;       // target_layer_ids_n * target_hidden_size
+    int32_t n_embd_tgt = 0;       // target model hidden size
+
+    const int32_t * target_layer_ids   = nullptr; // model_dft's extract layer indices
+    uint32_t        target_layer_ids_n = 0;
+
+    // [per-seq] deferred boundary state
+    std::vector<std::vector<float>> pending_g_last;
+    std::vector<llama_pos>          pending_pos_last;
+
+    // [per-seq] snapshot of the most recent process()'s encoder output
+    std::vector<std::vector<float>> verify_g;         // [n_seq][n_rows * n_embd_dec]
+    std::vector<llama_pos>          verify_pos_first; // [n_seq] — pos of verify_g[seq][0]
+    std::vector<int32_t>            verify_g_rows;    // [n_seq] — number of rows
+
+    // scratch buffer for concatenated target features [n_tokens, n_embd_enc]
+    std::vector<float> features_buf;
+    std::vector<float> g_embd_buf;

    common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
+        , params(params.draft)
    {
        LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+        GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set");
+
+        const llama_model * model_dft = llama_get_model(ctx_dft);
+        const llama_model * model_tgt = llama_get_model(ctx_tgt);
+
+        target_layer_ids   = llama_model_target_layer_ids  (model_dft);
+        target_layer_ids_n = llama_model_target_layer_ids_n(model_dft);
+        if (target_layer_ids_n != 3) {
+            throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
+                                     std::to_string(target_layer_ids_n) + ")");
+        }
+
+        n_embd_tgt = llama_model_n_embd(model_tgt);
+        n_embd_dec = llama_model_n_embd(model_dft);
+        n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt;
+
+        const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
+        batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
+        // llama_batch_init allocates only one of token/embd; eagle3 decoder needs both.
+        // TODO: fix, how to call without malloc
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b);
+
+        smpls.resize(n_seq);
+        for (auto & s : smpls) {
+            common_params_sampling sparams;
+            sparams.no_perf  = false;
+            sparams.top_k    = 10;
+            sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
+            s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
+        }
+
+        // turn on extraction of the target layers' input embeddings
+        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
+            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
+        }
+
+        // turn on extraction of the draft model's pre-norm hidden state
+        // (used both for the encoder output g_embd and the decoder pre-norm output).
+        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
+
+        pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
+        pending_pos_last.assign(n_seq, -1);
+
+        verify_g.assign(n_seq, std::vector<float>());
+        verify_pos_first.assign(n_seq, -1);
+        verify_g_rows.assign(n_seq, 0);
    }

-    void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
-        // noop
+    ~common_speculative_impl_draft_eagle3() override {
+        if (batch.token != nullptr) {
+            free(batch.token);
+            batch.token = nullptr;
+        }
+        llama_batch_free(batch);
    }

-    bool process(const llama_batch & /*batch*/) override {
-        // TODO: implement
+    void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
+        const int32_t N = (int32_t) prompt.size();
+        if (N <= 0) {
+            return;
+        }
+        // expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to
+        // draft()'s seed step). Warn only if more than one position is missing.
+        auto * ctx_dft = this->params.ctx_dft;
+        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+        if (pos_max < N - 2) {
+            LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
+                    "Drafts may degrade.\n",
+                    __func__, (int) pos_max, N - 2);
+        }
+    }
+
+    bool process(const llama_batch & batch_in) override {
+        if (batch_in.n_tokens <= 0) {
+            return true;
+        }
+
+        if (batch_in.token == nullptr || batch_in.embd != nullptr) {
+            return true;
+        }
+
+        const int32_t n_tokens = batch_in.n_tokens;
+
+        // i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's
+        // first/last token in batch_in. Assumes per-seq tokens are contiguous within
+        // the ubatch (server's default ordering).
+        std::vector<int32_t> i_batch_beg(n_seq, -1);
+        std::vector<int32_t> i_batch_end(n_seq, -1);
+        for (int k = 0; k < n_tokens; ++k) {
+            GGML_ASSERT(batch_in.n_seq_id[k] == 1);
+            const llama_seq_id seq_id = batch_in.seq_id[k][0];
+            if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+                continue;
+            }
+            i_batch_end[seq_id] = k;
+            if (i_batch_beg[seq_id] < 0) {
+                i_batch_beg[seq_id] = k;
+            }
+        }
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+
+        // Interleave each extract_layer's hidden state into a contiguous buffer of
+        // shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder
+        // to get one g_embd row per token.
+        features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
+
+        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
+            const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]);
+            if (!layer) {
+                GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]);
+            }
+            for (int32_t i = 0; i < n_tokens; ++i) {
+                float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
+                const float * src = layer + (size_t) i * n_embd_tgt;
+                std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float));
+            }
+        }
+
+        g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
+
+        // llama_encode() requires the full encoder batch to fit in n_ubatch.
+        // Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
+        const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
+        for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
+            const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
+
+            llama_batch enc_batch = {
+                /*.n_tokens =*/ n_chunk,
+                /*.token    =*/ nullptr,
+                /*.embd     =*/ features_buf.data() + (size_t) i * n_embd_enc,
+                /*.pos      =*/ nullptr,
+                /*.n_seq_id =*/ nullptr,
+                /*.seq_id   =*/ nullptr,
+                /*.logits   =*/ nullptr,
+            };
+            const int32_t rc = llama_encode(ctx_dft, enc_batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
+                        __func__, rc, (int) n_chunk, (int) i);
+                return false;
+            }
+
+            // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
+            const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft);
+            GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
+            std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
+                        g_embd_chunk,
+                        (size_t) n_chunk * n_embd_dec * sizeof(float));
+        }
+
+        const float * g_embd = g_embd_buf.data();
+
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        // EAGLE3 decoder input convention: at memory pos P the input pair is
+        // (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd.
+        //
+        // Per seq, in order:
+        //   (a) cross-ubatch bridge — when applicable, write the previously-deferred
+        //       pos using this ubatch's first token + pending_g_last.
+        //   (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k])
+        //       at pos[k]. The last training pos (k=end) is left unwritten = new
+        //       deferred boundary, completed by the next process() or draft() call.
+        //   (c) refresh deferred state — stash this ubatch's full g_embd into verify_g,
+        //       update pending_g_last / pending_pos_last to the last row.
+        common_batch_clear(batch);
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            const int32_t beg = i_batch_beg[seq_id];
+            const int32_t end = i_batch_end[seq_id];
+            if (beg < 0 || end < 0) {
+                continue;
+            }
+
+            // cross-ubatch bridge — complete the prior ubatch's deferred boundary.
+            // Fires iff all three preconditions hold:
+            //   1) pending_pos_last >= 0
+            //   2) pending_pos_last + 1 == pos[beg]
+            //   3) pending_pos_last > dft_pos_max // TODO: is this check needed?
+            const llama_pos pending_pos = pending_pos_last[seq_id];
+            if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
+                const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+                if (pending_pos > dft_pos_max) {
+                    common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                                pending_g_last[seq_id].data(), row_bytes);
+                }
+            }
+
+            for (int32_t k = beg; k < end; ++k) {
+                common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                            g_embd + (size_t) k * n_embd_dec, row_bytes);
+            }
+
+            // refresh deferred state
+            const int32_t n_rows = end - beg + 1;
+            verify_pos_first[seq_id] = batch_in.pos[beg];
+            pending_pos_last[seq_id] = batch_in.pos[end];
+            verify_g_rows[seq_id]    = n_rows;
+            verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f);
+            std::memcpy(verify_g[seq_id].data(),       g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows);
+            std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes);
+        }
+
+        if (batch.n_tokens > 0) {
+            const int32_t rc = llama_decode(ctx_dft, batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
+                        __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
+                return false;
+            }
+        }
+
        return true;
    }

-    void draft(common_speculative_draft_params_vec & /*dparams*/) override {
-        // TODO: implement
+    void draft(common_speculative_draft_params_vec & dparams) override {
+        auto & ctx_dft = params.ctx_dft;
+
+        common_batch_clear(batch);
+
+        // keep track of which sequences are still drafting
+        int n_drafting = 0;
+        std::vector<bool> drafting(n_seq);
+
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        // Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory
+        // pos pending_pos_last. dp.id_last is target's freshest sample (= corrected
+        // token after verify, or first generated token after prefill), matching the
+        // EAGLE3 input convention (token[P+1], g_embd[P]) at pos P.
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+
+            if (!dp.drafting) {
+                continue;
+            }
+            if (pending_pos_last[seq_id] < 0) {
+                continue;
+            }
+
+            n_drafting++;
+            drafting[seq_id] = true;
+            common_sampler_reset(smpls[seq_id].get());
+
+            llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1);
+
+            common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true);
+            std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                        pending_g_last[seq_id].data(),
+                        row_bytes);
+        }
+
+        if (batch.n_tokens == 0) {
+            return;
+        }
+
+        int ret = llama_decode(ctx_dft, batch);
+        if (ret != 0) {
+            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            return;
+        }
+
+        int i = 0;
+
+        while (n_drafting > 0) {
+            int i_batch = 0;
+
+            common_batch_clear(batch);
+
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                if (!drafting[seq_id]) {
+                    continue;
+                }
+
+                auto * smpl = smpls[seq_id].get();
+
+                common_sampler_sample(smpl, ctx_dft, i_batch, true);
+                // pre-norm hidden state of this position becomes g_embd for the next step
+                const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
+                ++i_batch;
+
+                const auto * cur_p = common_sampler_get_candidates(smpl, true);
+
+                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
+                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                }
+
+                const llama_token id = cur_p->data[0].id;
+
+                // only collect very high-confidence draft tokens
+                // (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop)
+                if (cur_p->data[0].p < params.p_min) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+
+                    continue;
+                }
+
+                common_sampler_accept(smpl, id, true);
+
+                auto & dp = dparams.at(seq_id);
+                auto & result = *dp.result;
+
+                result.push_back(id);
+
+                if (params.n_max <= (int) result.size()) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+                    continue;
+                }
+
+                common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes);
+            }
+
+            if (batch.n_tokens == 0) {
+                break;
+            }
+
+            ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }
+
+            ++i;
+        }
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+            if (!dp.drafting) {
+                continue;
+            }
+
+            if (dp.result->size() < (size_t) params.n_min) {
+                dp.result->clear();
+            }
+        }
    }

-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
-        // noop
+    void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+            return;
+        }
+
+        const int32_t n_rows = verify_g_rows[seq_id];
+        if (n_rows <= 0) {
+            return;
+        }
+
+        const int32_t i_g = std::min<int32_t>(n_accepted, n_rows - 1);
+        pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g;
+        std::memcpy(pending_g_last[seq_id].data(),
+                    verify_g[seq_id].data() + (size_t) i_g * n_embd_dec,
+                    (size_t) n_embd_dec * sizeof(float));
    }

    bool need_embd() const override {
@@ -418,6 +825,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

    int32_t n_embd = 0;

+    bool is_mem_shared = false;
+
    // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
    // The last h-row of one process() call needs the first token of the NEXT
    // call to pair with, so it's stashed here until that next call fires.
@@ -444,7 +853,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        auto * ctx_dft = this->params.ctx_dft;
        GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set");

-        n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
+        n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
+        GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
+                "MTP input row width must match the target h_nextn width");

        LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
@@ -490,6 +901,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false);
        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);

+        is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt;
+
        pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));

        i_batch_beg.assign(n_seq, -1);
@@ -526,9 +939,11 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        if (N <= 0) {
            return;
        }
+
        auto * ctx_dft = this->params.ctx_dft;
        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
-        if (pos_max < N - 1) {
+
+        if (pos_max < N - 1 && !is_mem_shared) {
            LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
                    "process() hook may not have run on every prefill ubatch "
                    "(need_embd / logits=1 on every prompt position?). "
@@ -571,48 +986,42 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

        const size_t row_bytes = (size_t) n_embd * sizeof(float);

-        common_batch_clear(batch);
+        // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode
+        if (!is_mem_shared) {
+            common_batch_clear(batch);

-        for (int k = 0; k < n_tokens; ++k) {
-            common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
-        }
-
-        // shift the tgt embeddings to the right by one position
-        // assumes that the tokens in the batch are sequential for each sequence
-        // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
-        //                                                       ^--- this is a problem
-        // TODO:this is generally true, but would be nice to assert it
-        {
-            const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
-            std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
-
-            //{
-            //    // string with seq_ids in the batch
-            //    std::stringstream ss;
-            //    for (int i = 0; i < n_tokens; ++i) {
-            //        ss << batch_in.seq_id[i][0] << ",";
-            //    }
-            //    LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str());
-            //}
-        }
-
-        // fill the pending embeddings from a previous run
-        auto set_h = [&](int idx, const float * h_row) {
-            std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
-        };
-
-        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
-            if (i_batch_beg[seq_id] < 0) {
-                continue;
+            for (int k = 0; k < n_tokens; ++k) {
+                common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
            }

-            set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
-        }
+            // shift the tgt embeddings to the right by one position
+            // assumes that the tokens in the batch are sequential for each sequence
+            // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
+            //                                                       ^--- this is a problem
+            // TODO:this is generally true, but would be nice to assert it
+            {
+                const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
+                std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
+            }

-        const int32_t rc = llama_decode(ctx_dft, batch);
-        if (rc != 0) {
-            LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
-            return false;
+            // fill the pending embeddings from a previous run
+            auto set_h = [&](int idx, const float * h_row) {
+                std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
+            };
+
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                if (i_batch_beg[seq_id] < 0) {
+                    continue;
+                }
+
+                set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
+            }
+
+            const int32_t rc = llama_decode(ctx_dft, batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
+                return false;
+            }
        }

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@@ -721,7 +1130,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                    continue;
                }

-                common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
+                if (is_mem_shared) {
+                    // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens
+                    // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37
+                    common_batch_add(batch, id, dp.n_past, { seq_id }, true);
+                } else {
+                    common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
+                }
                std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
            }

@@ -834,7 +1249,8 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
    common_speculative_impl_ngram_map_k(
            const common_ngram_map & config,
            uint32_t n_seq)
-        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq)
+        : common_speculative_impl(config.key_only ? COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K
+            : COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, n_seq)
    {
        for (uint32_t i = 0; i < n_seq; i++) {
            this->config.push_back(config);
@@ -1360,9 +1776,11 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);

        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
-        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
+        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
        bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;

+
+
        bool has_ngram_cache   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
        bool has_ngram_simple  = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
        bool has_ngram_map_k   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K));
@@ -40,6 +40,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "ChatGLMModel": "chatglm",
    "CodeShellForCausalLM": "codeshell",
    "CogVLMForCausalLM": "cogvlm",
+    "Cohere2MoeForCausalLM": "command_r",
    "Cohere2ForCausalLM": "command_r",
    "CohereForCausalLM": "command_r",
    "DbrxForCausalLM": "dbrx",
@@ -75,9 +76,11 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "Gemma3TextModel": "gemma",
    "Gemma3nForCausalLM": "gemma",
    "Gemma3nForConditionalGeneration": "gemma",
+    "Gemma4AssistantForCausalLM": "gemma",
    "Gemma4ForConditionalGeneration": "gemma",
    "Gemma4ForCausalLM": "gemma",
    "Gemma4UnifiedForConditionalGeneration": "gemma",
+    "Gemma4UnifiedAssistantForCausalLM": "gemma",
    "GemmaForCausalLM": "gemma",
    "Glm4ForCausalLM": "glm",
    "Glm4MoeForCausalLM": "glm",
@@ -128,6 +131,9 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LlamaBidirectionalModel": "llama",
    "LlamaForCausalLM": "llama",
    "LlamaModel": "llama",
+    "Eagle3DraftModel": "llama",
+    "Eagle3Speculator": "llama",
+    "LlamaForCausalLMEagle3": "llama",
    "LlavaForConditionalGeneration": "llama",
    "LlavaStableLMEpochForCausalLM": "stablelm",
    "MPTForCausalLM": "mpt",
@@ -253,6 +259,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "Glm4vMoeForConditionalGeneration": "qwen3vl",
    "GlmOcrForConditionalGeneration": "qwen3vl",
    "GlmasrModel": "ultravox",
+    "Granite4VisionForConditionalGeneration": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
    "HunYuanVLForConditionalGeneration": "hunyuan",
    "Idefics3ForConditionalGeneration": "smolvlm",
@@ -94,6 +94,7 @@ class ModelBase:
    metadata: gguf.Metadata
    dir_model_card: Path
    remote_hf_model_id: str | None
+    target_model_dir: Path | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -119,6 +120,7 @@ class ModelBase:
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                 disable_mistral_community_chat_template: bool = False,
                 sentence_transformers_dense_modules: bool = False,
+                 target_model_dir: Path | None = None,
                 fuse_gate_up_exps: bool = False,
                 fp8_as_q8: bool = False):
        if type(self) is ModelBase or \
@@ -139,6 +141,7 @@ class ModelBase:
        self.dry_run = dry_run
        self.remote_hf_model_id = remote_hf_model_id
        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
+        self.target_model_dir = target_model_dir
        self.fuse_gate_up_exps = fuse_gate_up_exps
        self._gate_exp_buffer: dict[int, Tensor] = {}
        self._up_exp_buffer: dict[int, Tensor] = {}
@@ -1192,7 +1195,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_embedding_length(n_embd)
            logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

@@ -1277,7 +1280,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_expert_group_used_count(n_group_used)
            logger.info(f"gguf: expert groups used count = {n_group_used}")

-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
            if score_func == "sigmoid":
                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
            elif score_func == "softmax":
@@ -1492,6 +1495,9 @@ class TextModel(ModelBase):
        if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
            # ref: https://huggingface.co/CohereLabs/tiny-aya-base
            res = "tiny_aya"
+        if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
+            # ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
+            res = "cohere2moe"
        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
            res = "qwen2"
@@ -2481,6 +2487,7 @@ class LazyTorchTensor(gguf.LazyBase):
        torch.float16: np.float16,
        torch.float32: np.float32,
        torch.uint8: np.uint8,
+        torch.int64: np.int64,
    }

    # only used when byteswapping data. Only correct size is needed
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import re
 from typing import Iterable, TYPE_CHECKING

 import torch
@@ -55,3 +56,122 @@ class Cohere2Model(TextModel):
            return

        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Cohere2MoeForCausalLM")
+class Cohere2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.COHERE2MOE
+    _n_main_layers: int | None = None
+    _expert_tensor_re = re.compile(
+        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if (n_nextn := int(self.hparams.get("num_nextn_predict_layers", 0) or 0)) > 0 and not self.no_mtp:
+            self.block_count += n_nextn
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        expert_intermediate_size = hparams["intermediate_size"]
+        mlp_layer_types = hparams.get("mlp_layer_types")
+        n_dense_lead = hparams.get("first_k_dense_replace", 0)
+        if mlp_layer_types is not None:
+            n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types))
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
+        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))
+        if (num_shared_experts := hparams.get("num_shared_experts", 0)) > 0:
+            if hparams.get("shared_expert_combination_strategy", "average") != "average":
+                raise ValueError("Cohere2 MoE only supports average shared expert combination")
+            self.gguf_writer.add_expert_shared_count(num_shared_experts)
+            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)
+        if (n_nextn := hparams.get("num_nextn_predict_layers", 0)) > 0 and not self.no_mtp:
+            self.gguf_writer.add_nextn_predict_layers(n_nextn)
+        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+    def index_tensors(self, remote_hf_model_id: str | None = None):
+        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
+        self._n_main_layers = hparams.get("num_hidden_layers")
+        type(self)._n_main_layers = self._n_main_layers
+        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
+
+    @classmethod
+    def filter_tensors(cls, item):
+        if (titem := super().filter_tensors(item)) is None:
+            return None
+        name, gen = titem
+
+        if cls._n_main_layers is not None:
+            is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
+            if is_mtp and cls.no_mtp:
+                return None
+            if cls.mtp_only and not is_mtp and name not in (
+                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
+            ):
+                return None
+
+        return name, gen
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".bias"):
+            if torch.any(data_torch != 0):
+                raise ValueError(f"Bias tensor {name!r} is not zero.")
+            logger.debug(f"Skipping bias tensor {name!r}.")
+            return
+
+        if (m := self._expert_tensor_re.fullmatch(name)) is not None:
+            n_experts = self.hparams["num_experts"]
+            layer_idx = int(m.group(1))
+            assert bid is None or bid == layer_idx
+
+            self._experts[layer_idx][name] = data_torch
+
+            expected = {
+                f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
+                for xid in range(n_experts)
+                for w_name in ("down_proj", "gate_proj", "up_proj")
+            }
+            if expected.issubset(self._experts[layer_idx]):
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[layer_idx][ename])
+                        del self._experts[layer_idx][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, layer_idx)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        experts = [k for d in self._experts for k in d.keys()]
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts}")
@@ -785,6 +785,26 @@ class Gemma4UnifiedModel(Gemma4Model):
            self.gguf_writer.add_suppress_tokens(suppress_tokens)


+@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM")
+class Gemma4AssistantModel(Gemma4Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+
+        if "masked_embedding" in name:
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return None
+
+        return super().filter_tensors(item)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
+        self.gguf_writer.add_nextn_predict_layers(self.block_count)
+
+
@ModelBase.register("Gemma4ForConditionalGeneration")
 class Gemma4VisionAudioModel(MmprojModel):
    has_audio_encoder = True
@@ -798,7 +818,8 @@ class Gemma4VisionAudioModel(MmprojModel):
        # remap audio hparams
        if self.hparams_audio:
            self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
-            self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
+            if "hidden_size" in self.hparams_audio:
+                self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
        else:
            self.has_audio_encoder = False

@@ -811,10 +832,11 @@ class Gemma4VisionAudioModel(MmprojModel):
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        # audio params
-        assert self.hparams_audio is not None
-        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
-        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
-        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
+        if self.has_audio_encoder:
+            assert self.hparams_audio is not None
+            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+            self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+            self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))

    def is_audio_tensor(self, name: str) -> bool:
        return "audio_tower" in name or "embed_audio" in name
@@ -872,7 +894,7 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
        assert self.hparams_audio is not None
        text_embd_dim = self.hparams_vision["mm_embed_dim"]
        self.hparams_vision["hidden_size"] = text_embd_dim
-        self.hparams_audio["hidden_size"] = text_embd_dim
+        self.hparams_audio["hidden_size"] = self.hparams_audio["audio_embed_dim"]
        # this is a transformer-less vision tower, the params below are redundant but set to avoid error
        self.hparams_vision["intermediate_size"] = 0
        self.hparams_vision["num_layers"] = 0
@@ -897,7 +919,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
            # ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC).
            # Permute columns so column i aligns with CHW input position i.
            assert self.hparams_vision is not None
-            p = self.hparams_vision["model_patch_size"]
+            if "model_patch_size" in self.hparams_vision:
+                p = self.hparams_vision["model_patch_size"]
+            else:
+                p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"]
            i = torch.arange(p * p * 3)
            ch  = i // (p * p)
            row = (i % (p * p)) // p
@@ -908,7 +933,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
        elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
            # same permutation for patch_ln1 as patch_dense to align with CHW input order
            assert self.hparams_vision is not None
-            p = self.hparams_vision["model_patch_size"]
+            if "model_patch_size" in self.hparams_vision:
+                p = self.hparams_vision["model_patch_size"]
+            else:
+                p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"]
            i = torch.arange(p * p * 3)
            ch  = i // (p * p)
            row = (i % (p * p)) // p
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import re
 from typing import Any, Callable, Iterable, TYPE_CHECKING

 import torch
@@ -13,7 +14,7 @@ from .llama import LlamaModel
 from .mamba import Mamba2Model


-@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
+@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
    """Conversion for IBM's GraniteForCausalLM"""
    model_arch = gguf.MODEL_ARCH.GRANITE
@@ -46,11 +47,29 @@ class GraniteModel(LlamaModel):
            self.gguf_writer.add_logit_scale(logits_scale)
            logger.info("gguf: (granite) logits_scale = %s", logits_scale)

+        # If being used as the base for Granite4 Vision, add deepstack_layer_arr
+        if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
+            normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
+            deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
+            for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
+                # Skip the first projector which is handled as the base embedding
+                # stream like normal
+                if proj_idx == 0:
+                    continue
+                deepstack_mapping_arr[llm_layer] = proj_idx
+            self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
+
    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item
-        if name.startswith("encoder."):
-            return None
+        # Skip multimodal tensors
+        if (
+            name.startswith(("encoder."))
+            or "image_" in name
+            or "layerwise_projectors" in name
+            or "spatial_projectors" in name
+        ):
+            return
        return super().filter_tensors(item)


@@ -241,7 +260,8 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"

    def set_vocab(self):
-        self.hparams["pad_vocab_size_multiple"] = 8
+        # For models with no ssm layers, don't pad for mamba2
+        self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
        Mamba2Model.set_vocab(self)


@@ -326,3 +346,133 @@ class GraniteSpeechMmprojModel(MmprojModel):
                data_torch = data_torch.squeeze(1)

        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Granite4VisionForConditionalGeneration")
+class Granite4VisionMmprojModel(MmprojModel):
+    has_vision_encoder = True
+    has_audio_encoder = False
+
+    @staticmethod
+    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
+        """Normalize both deepstack and spatial projector maps to the form:
+        (vision_layer, llm_layer, <type>, type_index)
+
+        This is then used to populate the following mappings:
+        - vision_feature_layers (mmproj hparam): ordered list of all
+          vision_layer values where order corresponds with the order of the
+          stacked projector tensors
+          NOTE: Values may appear multiple times for spatial projectors
+        - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
+          the index of the corresponding projector in the stacked tensors
+        - deepstack_layer_arr (llm hparam): per-text-layer array indicating
+          which input vision feature should be injected at that layer
+          (-1 if none)
+
+        Output: (vision_layer, llm_layer, <type>, type_index)
+        """
+        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
+        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
+        n_text_layers = global_config["text_config"]["num_hidden_layers"]
+        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]
+        normalized_projector_map = []
+        if deepstack_map:
+            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
+                if vision_layer < 0:
+                    vision_layer = n_vision_layers + vision_layer
+                if llm_layer < 0:
+                    llm_layer = n_text_layers + llm_layer
+                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))
+        if spatial_layers:
+            spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
+            if spatial_vision_layer < 0:
+                spatial_vision_layer = n_vision_layers + spatial_vision_layer
+            for spatial_idx, llm_layer in enumerate(spatial_layers):
+                normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))
+        return list(sorted(normalized_projector_map, key=(lambda entry: entry[1])))
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        normalized_projector_map = self.get_normalized_projector_map(self.global_config)
+        self._n_proj = len(normalized_projector_map)
+
+        self._tensor_prefix_map = {
+            f"model.{proj_type}_projectors.{type_idx}": proj_idx
+            for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
+        }
+        self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
+        self._spatial_offsets = [
+            type_idx if proj_type == "spatial" else -1
+            for _, _, proj_type, type_idx in normalized_projector_map
+        ]
+
+    def set_gguf_parameters(self):
+        assert self.hparams_vision is not None
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)
+
+        # SigLIP encoder hparams
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        # Preprocessor
+        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))
+
+        # QFormer projector config
+        ds_rate = self.global_config["downsample_rate"]
+        ds_parts = ds_rate.split("/")
+        assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}"
+        query_side, window_side = [int(p) for p in ds_parts]
+        self.gguf_writer.add_vision_projector_query_side(query_side)
+        self.gguf_writer.add_vision_projector_window_side(window_side)
+
+        # Set vision feature layers
+        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)
+
+        # Set the spatial offests per projector
+        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)
+
+        # Add flattened image grind pinpoints (resolution candidates internally)
+        if pinpoints := self.global_config.get("image_grid_pinpoints"):
+            # Flatten with h, w -> w, h inversion
+            pinpoints = [val for h, w in pinpoints for val in (w, h)]
+            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        if ("vision_model.head" in name or name.startswith("lm_head")):
+            return None
+        return super().filter_tensors(item)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # Detect projector tensors and bin them
+        projector_idx = None
+        for prefix, proj_idx in self._tensor_prefix_map.items():
+            if name.startswith(prefix):
+                projector_idx = proj_idx
+                break
+        if projector_idx is not None:
+            # If this projector tensor has a block id within the projector,
+            # alias the bid to projector_idx
+            #
+            # TODO: currently, none of the Granite 4 Vision models have
+            # projectors with multiple QFormer layers, so the `layer.{}` index
+            # is always 0. This allows us to simply map to a single `bid` that
+            # matches the projector index. If this changes, we'll need a
+            # convention that merges the two IDs.
+            id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
+            all_ids = [int(m.group(1)) for m in id_matches]
+            assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names"
+            # If not layer id, just use the projector index
+            new_bid = projector_idx
+            if len(all_ids) == 1:
+                new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:]
+            else: # len(all_ids) == 2
+                new_bid = projector_idx # + all_ids[1]
+                new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:]
+            yield from super().modify_tensors(data_torch, new_name, new_bid)
+            return
+        yield from super().modify_tensors(data_torch, name, bid)
@@ -5,12 +5,13 @@ import math

 from typing import Callable, Iterable, TYPE_CHECKING

+import numpy as np
 import torch

 if TYPE_CHECKING:
    from torch import Tensor

-from .base import ModelBase, TextModel, gguf
+from .base import ModelBase, TextModel, gguf, logger


@ModelBase.register(
@@ -21,6 +22,9 @@ from .base import ModelBase, TextModel, gguf
    "VLlama3ForCausalLM",
    "LlavaForConditionalGeneration",
    "VoxtralForConditionalGeneration",
+    "LlamaForCausalLMEagle3",
+    "Eagle3Speculator",
+    "Eagle3DraftModel",
    "IQuestCoderForCausalLM",
    "LlamaModel")
 class LlamaModel(TextModel):
@@ -39,7 +43,61 @@ class LlamaModel(TextModel):
            hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
            self.origin_hf_arch = hparams.get('architectures', [None])[0]

+        # Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
+        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
+            self.is_eagle3 = True
+            self.model_arch = gguf.MODEL_ARCH.EAGLE3
+            logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
+            # Re-initialize tensor_map with eagle3 architecture
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+            # Update gguf_writer architecture
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+            self.gguf_writer.add_architecture()
+            if self.target_model_dir is None:
+                raise ValueError(
+                    "EAGLE-3 model requires --target-model-dir to be specified. "
+                    "Please provide the path to the target model directory to read config.json"
+                )
+            # Read both eagle3 raw config and target model config
+            with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
+                eagle3_raw_config = json.load(f)
+            with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
+                target_config = json.load(f)
+
+            if "text_config" in target_config:
+                target_config = {**target_config, **target_config["text_config"]}
+            self.target_vocab_size = target_config["vocab_size"]
+
+            # target_layers: derived from target model layer count (low/mid/high)
+            target_num_layers = target_config["num_hidden_layers"]
+            target_layers = [2, target_num_layers // 2, target_num_layers - 3]
+            logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
+            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
+
+            # target_hidden_size: prefer eagle3 config, fallback to target config
+            if eagle3_raw_config.get("target_hidden_size") is not None:
+                target_hidden_size = eagle3_raw_config["target_hidden_size"]
+                src = "EAGLE-3 config"
+            else:
+                target_hidden_size = target_config["hidden_size"]
+                src = "target model config"
+            logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
+            self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
+
+            # norm_before_residual (RedHat-style eagle3 specific)
+            norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
+            logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
+            self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
+
    def set_vocab(self):
+        # eagle3: use tokenizer from target model if provided
+        original_dir_model = None
+        if getattr(self, 'is_eagle3', False):
+            assert self.target_model_dir is not None
+            logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
+            original_dir_model = self.dir_model
+            self.dir_model = self.target_model_dir
+
        if self.origin_hf_arch == "GlmasrModel":
            return self._set_vocab_glmedge()

@@ -85,6 +143,10 @@ class LlamaModel(TextModel):
        if self.hparams.get("vocab_size", 32000) == 49152:
            self.gguf_writer.add_add_bos_token(False)

+        # eagle3: Restore original dir_model
+        if original_dir_model is not None:
+            self.dir_model = original_dir_model
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
@@ -129,7 +191,49 @@ class LlamaModel(TextModel):

        return super().filter_tensors((name, gen))

+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        tensors = super().index_tensors(remote_hf_model_id)
+
+        # Handle Eagle3Speculator nested config
+        if "transformer_layer_config" in self.hparams:
+            self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
+
+        # eagle3 detection
+        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
+            logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
+            new_tensors = {}
+            for name, gen in tensors.items():
+                if name.startswith("midlayer."):
+                    new_name = "model.layers.0." + name[len("midlayer."):]
+                    new_tensors[new_name] = gen
+                elif name.startswith("layers.0."):  # Eagle3Speculator format
+                    new_name = "model." + name
+                    new_tensors[new_name] = gen
+                else:
+                    new_tensors[name] = gen
+            return new_tensors
+
+        return tensors
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # eagle3: special tensors that bypass standard llama mapping
+        if getattr(self, 'is_eagle3', False):
+            if name == "fc.weight":
+                yield (name, data_torch)
+                return
+            if name == "d2t":
+                # store for manual int64 handling in prepare_tensors (avoid F32 conversion)
+                if not hasattr(self, '_eagle3_int_tensors'):
+                    self._eagle3_int_tensors = {}
+                self._eagle3_int_tensors[name] = data_torch
+                return
+            if name == "t2d":
+                # not used at runtime, skip
+                return
+            if name.endswith(".hidden_norm.weight"):
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
+                return
+
        n_head = self.find_hparam(["n_heads", "num_attention_heads"])
        n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])

@@ -205,8 +309,33 @@ class LlamaModel(TextModel):
                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

    def prepare_tensors(self):
+        # eagle3: collect d2t original dtype before parent converts tensors to F32
+        eagle3_original_dtypes = {}
+        if getattr(self, 'is_eagle3', False):
+            for name, data_torch in self.get_tensors():
+                if name == "d2t":
+                    eagle3_original_dtypes[name] = data_torch.dtype
+
        super().prepare_tensors()

+        # eagle3: write d2t as absolute target token ids
+        if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
+            for name, data_torch in self._eagle3_int_tensors.items():
+                old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
+                data = data_torch.to(torch.int64).cpu().numpy()
+                if name == "d2t":
+                    data = data.reshape(-1)
+                    data = data + np.arange(data.size, dtype=np.int64)
+                    if np.any((data < 0) | (data >= self.target_vocab_size)):
+                        raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
+                    if np.unique(data).size != data.size:
+                        raise ValueError("EAGLE-3 d2t contains duplicate target ids")
+                data_qtype = gguf.GGMLQuantizationType.I64
+
+                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+                self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
+
        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
@@ -105,8 +105,9 @@ class MistralModel(LlamaModel):
            gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim)
            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])

-        if "llama_4_scaling" in hparams:
-            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
+        llama_4_scaling = hparams.get("llama_4_scaling")
+        if llama_4_scaling is not None:
+            gguf_writer.add_attn_temperature_scale(llama_4_scaling["beta"])


 class MistralMoeModel(DeepseekV2Model):
@@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
        help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
    )

+    parser.add_argument(
+        "--target-model-dir", type=str, default=None,
+        help=(
+            "path to the target model directory; required when converting a standalone draft model "
+            "(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
+            "layer count to populate its GGUF."
+        ),
+    )
+
    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")
@@ -238,7 +247,7 @@ def main() -> None:
            assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
            from conversion.pixtral import PixtralModel
            model_class = PixtralModel
-        elif "moe" in hparams:
+        elif hparams.get("moe") is not None:
            from conversion.mistral import MistralMoeModel
            model_class = MistralMoeModel
        else:
@@ -269,6 +278,7 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split,
                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
+                                     target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
                                     fuse_gate_up_exps=args.fuse_gate_up_exps,
                                     fp8_as_q8=args.fp8_as_q8,
                                     )
@@ -100,6 +100,7 @@ models = [
    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "tiny_aya",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
+    {"name": "cohere2moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -25,7 +25,7 @@ import gguf
 from gguf.constants import GGUFValueType

 # reuse model definitions from the conversion/ package
-from conversion import LazyTorchTensor, ModelBase, get_model_class
+from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture

 logger = logging.getLogger("lora-to-gguf")

@@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace:
        "--base-model-id", type=str,
        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
    )
+    parser.add_argument(
+        "--trust-remote-code", default=False, action="store_true",
+        help="trust remote code in the model",
+    )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
+def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
    from huggingface_hub import try_to_load_from_cache

    # normally, adapter does not come with base model config, we need to load it from AutoConfig
-    config = AutoConfig.from_pretrained(hf_model_id)
+    config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
    cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
    cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None

@@ -372,13 +376,13 @@ if __name__ == '__main__':
    # load base model
    if base_model_id is not None:
        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
+        hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
    elif dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
            try:
-                hparams, dir_base_model = load_hparams_from_hf(model_id)
+                hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
            except OSError as e:
                logger.error(f"Failed to load base model config: {e}")
                logger.error("Please try downloading the base model and add its path to --base")
@@ -392,10 +396,12 @@ if __name__ == '__main__':
        hparams = ModelBase.load_hparams(dir_base_model, False)

    with torch.inference_mode():
+        model_arch = get_model_architecture(hparams, ModelType.TEXT)
        try:
-            model_class = get_model_class(hparams["architectures"][0])
+            model_class = get_model_class(model_arch)
+            logger.info("Using model architecture: %s", model_arch)
        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            logger.error(f"Model {model_arch} is not supported")
            sys.exit(1)

        class LoraModel(model_class):  # ty: ignore[unsupported-base]
@@ -270,7 +270,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t

 ---

-**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox.
+**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox.

 **Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.

@@ -14,16 +14,17 @@ Legend:

 | Operation | BLAS | CANN | CPU | CUDA | MTL | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
 |-----------|------|------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
-|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                        COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
 |                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
@@ -41,25 +42,25 @@ Legend:
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
 |                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -68,38 +69,38 @@ Legend:
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
 |                              PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                         SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
@@ -107,16 +108,16 @@ Legend:
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
-|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
@@ -582,42 +582,42 @@
 "SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
 "SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
 "SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
-"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
+"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
 "SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
 "SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
 "SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[768,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
@@ -914,57 +914,58 @@
 "SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=0,p1=1","support","1","yes","SYCL"
 "SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=1,p1=0","support","1","yes","SYCL"
 "SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=1,p1=1","support","1","yes","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","0","no","SYCL"
-"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","0","no","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","1","yes","SYCL"
+"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
+"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[3000,384,1,1],ne_kernel=[3,384,384,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=0,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=0,p1=0,d0=3,d1=0,is_2D=0","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=3,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
@@ -1050,6 +1051,8 @@
 "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
 "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
+"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,1,2],ne_kernel=[32,33,1,2],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
+"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,2,1],ne_kernel=[33,34,2,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
 "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL"
 "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL"
 "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL"
@@ -5047,6 +5050,39 @@
 "SYCL0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,2,2,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
 "SYCL0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,1,2,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
 "SYCL0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f32,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=f16,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL"
+"SYCL0","COL2IM_1D","type=bf16,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL"
 "SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","SYCL"
 "SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","SYCL"
 "SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","SYCL"
@@ -6185,6 +6221,7 @@
 "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=64,n=1,k=64,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=256,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=512,n=1,k=512,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=32,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=4,k=128,bs=[2,3],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
 "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
@@ -7603,6 +7640,31 @@
 "SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=16,n_used=16,b=1,m=50,n=200,k=64","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=1,n_used=1,b=0,m=8,n=16,k=1","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=32,n_used=2,b=0,m=2880,n=32,k=2880","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=bf16,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q4_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q4_1,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q5_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q5_1,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=384","support","0","no","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=192","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q2_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q3_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q4_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q5_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=q6_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq2_xxs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq2_xs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq2_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq3_xxs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq1_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq1_m,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq4_nl,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq3_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
+"SYCL0","MUL_MAT_ID","type_a=iq4_xs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","SYCL"
 "SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","SYCL"
@@ -10845,37 +10907,117 @@
 "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=1","support","1","yes","SYCL"
 "SYCL0","ROPE","type=f16,ne_a=[128,32,2,3],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
+"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","SYCL"
@@ -16515,6 +16657,7 @@
 "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL"
 "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL"
 "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL"
+"SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=256,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL"
 "SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL"
 "SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q4_0,permute=[0,1,2,3]","support","0","no","SYCL"
 "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=128,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL"
@@ -175,7 +175,7 @@ int main(int argc, char ** argv) {
                    llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), seq_id));

            if (use_ckpt_dft) {
-                ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
            }

            // generate a new draft
@@ -196,12 +196,12 @@ int main(int argc, char ** argv) {
            // this allows us to restore the state if partial draft acceptance occurs
            if (!draft.empty()) {
                if (use_ckpt_tgt) {
-                    ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                    ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                }
            }

            {
-                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
            }
@@ -261,13 +261,13 @@ int main(int argc, char ** argv) {
            draft = std::move(ids);

            {
-                ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

                llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, ckpt.pos_max + 1, -1);
            }

            {
-                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
            }
@@ -3,15 +3,45 @@
 #  Copyright (C) 2024 Intel Corporation
 #  SPDX-License-Identifier: MIT

+print_usage() {
+    echo "Usage: ./build.sh [fp32|fp16] [--help]"
+    echo ""
+    echo "Options:"
+    echo "  fp32    Build with FP32 precision (default)"
+    echo "  fp16    Build with FP16 precision (faster for long-prompt inference)"
+    echo "  --help  Print this help message"
+}
+
+PRECISION=fp32
+
+for arg in "$@"; do
+    case "$arg" in
+        --help)
+            print_usage
+            exit 0
+            ;;
+        fp32|fp16)
+            PRECISION="$arg"
+            ;;
+        *)
+            echo "Error: unknown option '$arg'"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
 mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh

-#for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
-
-#for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+if [ "$PRECISION" = "fp16" ]; then
+    #for FP16
+    cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
+else
+    #for FP32
+    cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+fi

 #build example/main
 #cmake --build . --config Release --target main
@@ -3,6 +3,23 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT

+IF /I "%1"=="--help" (
+    echo Usage: win-build-sycl.bat [fp32^|fp16] [--help]
+    echo.
+    echo Options:
+    echo   fp32    Build with FP32 precision ^(default^)
+    echo   fp16    Build with FP16 precision ^(faster for long-prompt inference^)
+    echo   --help  Print this help message
+    exit /B 0
+)
+
+SET PRECISION=%1
+IF "%PRECISION%"=="" SET PRECISION=fp32
+IF /I NOT "%PRECISION%"=="fp32" IF /I NOT "%PRECISION%"=="fp16" (
+    echo Error: invalid value '%PRECISION%'. Use 'fp32' or 'fp16'.
+    echo Usage: win-build-sycl.bat [fp32^|fp16] [--help]
+    exit /B 1
+)

 IF not exist build (mkdir build)
 cd build
@@ -11,12 +28,14 @@ if %errorlevel% neq 0 goto ERROR
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 if %errorlevel% neq 0 goto ERROR

-::  for FP16
-::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
-
-::  for FP32
-cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+IF /I "%PRECISION%"=="fp16" (
+    ::  for FP16
+    ::  faster for long-prompt inference
+    cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+) ELSE (
+    ::  for FP32
+    cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+)
 if %errorlevel% neq 0 goto ERROR

 ::  build all binary
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)

 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 13)
+set(GGML_VERSION_MINOR 15)
 set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

@@ -8,10 +8,10 @@ extern "C" {

 #define RPC_PROTO_MAJOR_VERSION    4
 #define RPC_PROTO_MINOR_VERSION    0
-#define RPC_PROTO_PATCH_VERSION    0
+#define RPC_PROTO_PATCH_VERSION    1

 #ifdef  __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif

 #define GGML_RPC_MAX_SERVERS       16
@@ -535,6 +535,7 @@ extern "C" {
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
        GGML_OP_IM2COL_3D,
+        GGML_OP_COL2IM_1D,
        GGML_OP_CONV_2D,
        GGML_OP_CONV_3D,
        GGML_OP_CONV_2D_DW,
@@ -2007,6 +2008,16 @@ extern "C" {
        int                   d1, // dilation dimension 1
        bool                  is_2D);

+    // col2im_1d: scatter-add GEMM columns back to 1D signal
+    // a: [K*OC, T_in]  (columns from matmul, K = a->ne[0]/OC)
+    // result: [T_out, OC]  where T_out = (T_in - 1)*s0 + K - 2*p0
+    GGML_API struct ggml_tensor * ggml_col2im_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,   // columns [K*OC, T_in]
+        int                   s0,  // stride
+        int                   oc,  // output channels
+        int                   p0); // padding to crop from both sides
+
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,   // convolution kernel
@@ -2542,10 +2553,16 @@ extern "C" {
    // TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST]
    // ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
    //
-    // state is a 3D tensor of shape (S_v*S_v*H, K, n_seqs):
-    //   K == 1: output carries the final state only.
-    //   K  > 1: output carries K snapshot slots; the kernel writes the last min(n_tokens, K)
-    //   per-token snapshots into the trailing slots
+    // tensor shapes (S_k == S_v, H_v % H_k == 0):
+    //   q, k  : [S_k, H_k, n_tokens, n_seqs]
+    //   v     : [S_v, H_v, n_tokens, n_seqs]
+    //   g     : [1, H_v, n_tokens, n_seqs] (scalar gate) or [S_v, H_v, n_tokens, n_seqs] (KDA)
+    //   beta  : [1, H_v, n_tokens, n_seqs]
+    //   state : [S_v, S_v, H_v, n_seqs] -- initial recurrent state s0
+    //
+    // the output packs the attention scores [S_v, H_v, n_tokens, n_seqs] followed by K state
+    // snapshots, most-recent first (slot 0 = final state, slot s = state s tokens back). K == 1
+    // keeps only the final state; when n_tokens < K only slots 0..n_tokens-1 are written.
    GGML_API struct ggml_tensor * ggml_gated_delta_net(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -2553,7 +2570,8 @@ extern "C" {
            struct ggml_tensor  * v,
            struct ggml_tensor  * g,
            struct ggml_tensor  * beta,
-            struct ggml_tensor  * state);
+            struct ggml_tensor  * state,
+            int64_t               K);

    // custom operators

@@ -776,8 +776,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
        GGML_ASSERT(src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_1);
        GGML_ASSERT(src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_1);
        GGML_ASSERT(src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_1);
-        // state shape is (S_v*S_v*H, K, n_seqs); the heads dim is nested inside axis 0,
-        // so a head-aligned split on the input cache reshapes to axis 0 here (not axis 2).
+        // state shape is [S_v, S_v, H_v, n_seqs] (s0 only); the heads dim is its own axis 2,
+        // so a head-aligned split on the input cache lands on axis 2 here.
        GGML_ASSERT(src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_2 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_1 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_0);
        return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1};
    };
@@ -293,7 +293,6 @@
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__wasm__)
 // quants.c
-#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
@@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_im2col_3d(params, tensor);
            } break;
+        case GGML_OP_COL2IM_1D:
+            {
+                ggml_compute_forward_col2im_1d(params, tensor);
+            } break;
        case GGML_OP_CONV_2D:
            {
                ggml_compute_forward_conv_2d(params, tensor);
@@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_CONV_2D:
        case GGML_OP_CONV_3D:
        case GGML_OP_CONV_2D_DW:
+        case GGML_OP_COL2IM_1D:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
@@ -2943,7 +2948,7 @@ struct ggml_cplan ggml_graph_plan(
                case GGML_OP_GATED_DELTA_NET:
                    {
                        const int64_t S_v = node->src[2]->ne[0];
-                        const int64_t K   = node->src[5]->ne[1];  // state is (D, K, n_seqs)
+                        const int64_t K   = ggml_get_op_params_i32(node, 0);
                        const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
                        cur = per_thread * sizeof(float) * n_tasks;
                    } break;
@@ -38,6 +38,7 @@
 #include "kleidiai.h"

 #include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-threading.h"
@@ -61,7 +62,8 @@ struct ggml_kleidiai_context {
    ggml_kleidiai_kernels * kernels_q8;
    int sme_thread_cap; // <= 0 means “SME disabled/unknown”;
    int thread_hint;    // <= 0 means “no hint”
-} static ctx = { CPU_FEATURE_NONE, nullptr, nullptr, 0, -1 };
+    int chunk_multiplier;
+} static ctx = { CPU_FEATURE_NONE, nullptr, nullptr, 0, -1, 4 };

 static const char* cpu_feature_to_string(cpu_feature f) {
    if (f == CPU_FEATURE_NONE) {
@@ -186,8 +188,9 @@ static void init_kleidiai_context(void) {
    if (!initialized) {
        initialized = true;

-        const char *env_sme     = getenv("GGML_KLEIDIAI_SME");
-        const char *env_threads = getenv("GGML_TOTAL_THREADS");
+        const char *env_sme         = getenv("GGML_KLEIDIAI_SME");
+        const char *env_threads     = getenv("GGML_TOTAL_THREADS");
+        const char *env_chunk_mult  = getenv("GGML_KLEIDIAI_CHUNK_MULTIPLIER");

        const bool cpu_has_sme = ggml_cpu_has_sme();
        size_t detected_smcus = 0;
@@ -204,6 +207,14 @@ static void init_kleidiai_context(void) {
            }
        }

+        if (env_chunk_mult) {
+            bool ok = false;
+            int multiplier = parse_uint_env(env_chunk_mult, "GGML_KLEIDIAI_CHUNK_MULTIPLIER", &ok);
+            if (ok && multiplier > 0) {
+                ctx.chunk_multiplier = multiplier;
+            }
+        }
+
        // SME policy:
        // - If CPU doesn't support SME: SME always off.
        // - Else:
@@ -296,6 +307,50 @@ static inline size_t align_up(size_t value, size_t alignment) {
    return remainder == 0 ? value : value + (alignment - remainder);
 }

+static inline size_t gcd_size(size_t a, size_t b) {
+    while (b != 0) {
+        const size_t t = a % b;
+        a = b;
+        b = t;
+    }
+    return a;
+}
+
+static inline bool lcm_size(size_t a, size_t b, size_t & result) {
+    if (a == 0 || b == 0) {
+        result = 0;
+        return false;
+    }
+    const size_t g = gcd_size(a, b);
+    const size_t q = a / g;
+    if (q > SIZE_MAX / b) {
+        return false;
+    }
+    result = q * b;
+    return true;
+}
+
+static inline size_t ceil_div_size(size_t a, size_t b) {
+    return b == 0 ? 0 : (a + b - 1) / b;
+}
+
+struct kleidiai_block_args {
+    size_t lhs_bl;
+    size_t rhs_bl;
+    size_t pack_bl;
+};
+
+static inline kleidiai_block_args kleidiai_get_block_args(ggml_type rhs_type) {
+    switch (rhs_type) {
+        case GGML_TYPE_Q4_0:
+            return { QK4_0, QK4_0, QK4_0 };
+        case GGML_TYPE_Q8_0:
+            return { 0, 0, QK8_0 };
+        default:
+            return { 0, 0, 0 };
+    }
+}
+
 static inline bool kleidiai_pack_fallback_allowed() {
    if (ctx.sme_thread_cap <= 0) {
        return false;
@@ -746,8 +801,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
            size_t n_step;
            size_t lhs_packed_size;
            size_t lhs_offset;
-            size_t n_offset;
-            size_t n_cols;
+            size_t lhs_bl;
+            size_t rhs_bl;
+            size_t pack_bl;
+            size_t lhs_packed_offset0;
            int assigned_threads;
            int thread_begin;
            int thread_end;
@@ -772,6 +829,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                continue;
            }

+            const kleidiai_block_args block_args = kleidiai_get_block_args(kernels->rhs_type);
+
            runtime[runtime_count] = {
                slot,
                kernels,
@@ -784,7 +843,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                kinfo->get_n_step(),
                0,
                0,
-                0,
+                block_args.lhs_bl,
+                block_args.rhs_bl,
+                block_args.pack_bl,
                0,
                0,
                0,
@@ -795,45 +856,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        }

        if (runtime_count == 0) {
-            ggml_kleidiai_kernels * fallback = ggml_kleidiai_select_kernels(ctx.features, dst);
-            if (!fallback) {
-                return false;
-            }
-            kernel_info * kinfo      = is_gemv ? &fallback->gemv : &fallback->gemm;
-            lhs_packing_info * linfo = is_gemv ? &fallback->gemv_lhs_info : &fallback->gemm_lhs_info;
-            rhs_packing_info * rinfo = &fallback->rhs_info;
-            if (!kinfo || !linfo || !linfo->packed_size_ex || !linfo->pack_func_ex ||
-                !kinfo->get_rhs_packed_offset_ex || !kinfo->run_kernel_ex || !kinfo->get_dst_offset ||
-                !rinfo || !rinfo->pack_func_ex || !rinfo->packed_size_ex) {
-                return false;
-            }
-            kernel_chain[0] = fallback;
-            runtime[0] = {
-                0,
-                fallback,
-                kinfo,
-                linfo,
-                kinfo->get_mr(),
-                kinfo->get_nr(),
-                kinfo->get_kr(),
-                kinfo->get_sr(),
-                kinfo->get_n_step(),
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
-                nullptr
-            };
-            size_t rhs_size_fallback = 0;
-            const uint8_t * rhs_base = weight_for_slot(0, rhs_size_fallback);
-            if (!rhs_base) {
-                rhs_base = static_cast<const uint8_t *>(src0->data);
-            }
-            runtime[0].rhs_base = rhs_base;
-            runtime_count = 1;
+            GGML_LOG_WARN("kleidiai: no runtime kernel slot available for supported op %s\n", dst->name);
+            return false;
        }

        const int nth_total = params->nth > 0 ? params->nth : 1;
@@ -846,6 +870,13 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                break;
            }
        }
+        int non_sme_slot = -1;
+        for (int i = 0; i < runtime_count; ++i) {
+            if ((runtime[i].kernels->required_cpu & CPU_FEATURE_SME) != CPU_FEATURE_SME) {
+                non_sme_slot = i;
+                break;
+            }
+        }

        const int sme_cap_limit = ctx.sme_thread_cap;
        const bool use_hybrid = sme_cap_limit > 0 &&
@@ -864,12 +895,15 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        if (!hybrid_enabled) {
            int chosen_slot = 0;
            if (too_small_for_hybrid && sme_slot != -1) {
-                chosen_slot = sme_slot;
+                chosen_slot = nth_total > sme_cap_limit && non_sme_slot != -1 ? non_sme_slot : sme_slot;
            } else if (runtime_count > 1 && ctx.sme_thread_cap > 0 && nth_total > ctx.sme_thread_cap) {
                chosen_slot = 1;
            }
            if (chosen_slot != 0 && chosen_slot < runtime_count) {
                runtime[0] = runtime[chosen_slot];
+                runtime[0].assigned_threads = 0;
+                runtime[0].thread_begin = 0;
+                runtime[0].thread_end = 0;
            }
            runtime_count = runtime_count > 0 ? 1 : 0;

@@ -896,6 +930,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {

        int fallback_indices[GGML_KLEIDIAI_MAX_KERNEL_SLOTS];
        int fallback_count = 0;
+        // The current hybrid chain is bounded to SME + one non-SME fallback slot.
+        GGML_ASSERT(GGML_KLEIDIAI_MAX_KERNEL_SLOTS == 2);
        for (int i = 0; i < runtime_count; ++i) {
            if (i == sme_slot) {
                continue;
@@ -952,73 +988,67 @@ class tensor_traits : public ggml::cpu::tensor_traits {

        size_t cursor = 0;
        for (int i = 0; i < runtime_count; ++i) {
-            const ggml_type slot_rhs_type = runtime[i].kernels->rhs_type;
-            const size_t slot_pack_size_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 :
-                                              slot_rhs_type == GGML_TYPE_Q8_0 ? QK8_0 : 0;
-            runtime[i].lhs_packed_size = runtime[i].lhs_info->packed_size_ex(m, k, slot_pack_size_arg, runtime[i].mr, runtime[i].kr, runtime[i].sr);
+            runtime[i].lhs_packed_size = runtime[i].lhs_info->packed_size_ex(m, k, runtime[i].pack_bl, runtime[i].mr, runtime[i].kr, runtime[i].sr);
            cursor = align_up(cursor, GGML_KLEIDIAI_PACK_ALIGN);
            runtime[i].lhs_offset = cursor;
+            runtime[i].lhs_packed_offset0 = runtime[i].lhs_info->get_packed_offset_ex(0, k, runtime[i].lhs_bl, runtime[i].mr, runtime[i].kr, runtime[i].sr);
            cursor += runtime[i].lhs_packed_size;
        }

        GGML_ASSERT(cursor <= params->wsize);
        uint8_t * scratch = static_cast<uint8_t *>(params->wdata);

-        size_t assigned_cols = 0;
-        uint64_t weighted_total = 0;
-        if (runtime_count > 1 && sme_slot != -1) {
-            for (int i = 0; i < runtime_count; ++i) {
-                const uint64_t weight = (i == sme_slot) ? (sme_cap << 1) : 1;
-                weighted_total += (uint64_t)runtime[i].assigned_threads * weight;
-            }
-        }
+        size_t common_step = 1;
        for (int i = 0; i < runtime_count; ++i) {
-            runtime[i].n_offset = assigned_cols;
            if (runtime[i].assigned_threads == 0) {
-                runtime[i].n_cols = 0;
                continue;
            }
-            const size_t remaining_cols = n - assigned_cols;
-            if (remaining_cols == 0) {
-                runtime[i].n_cols = 0;
-                continue;
+            size_t next_step = 0;
+            if (!lcm_size(common_step, runtime[i].n_step ? runtime[i].n_step : 1, next_step)) {
+                return false;
            }
-            const size_t step = runtime[i].n_step ? runtime[i].n_step : 1;
-            size_t target      = 0;
-            if (weighted_total > 0) {
-                const uint64_t weight = (i == sme_slot) ? (sme_cap << 1) : 1;
-                target = (size_t)(((uint64_t)n * runtime[i].assigned_threads * weight) / weighted_total);
-            } else {
-                target = (size_t)(((uint64_t)n * runtime[i].assigned_threads) / nth_total);
-            }
-            target             = std::min(target, remaining_cols);
-            size_t aligned     = round_down(target, step);
-            if (aligned == 0 && remaining_cols >= step) {
-                aligned = step;
-            }
-            runtime[i].n_cols = aligned;
-            assigned_cols += aligned;
+            common_step = next_step;
        }
+        GGML_ASSERT(common_step > 0);

-        if (assigned_cols < n) {
-            for (int i = runtime_count - 1; i >= 0; --i) {
-                if (runtime[i].assigned_threads > 0) {
-                    runtime[i].n_cols += n - assigned_cols;
-                    break;
-                }
-            }
+        const bool disable_chunking = ggml_is_numa();
+        const size_t chunk_multiplier = std::max(1, ctx.chunk_multiplier);
+        const size_t chunk_divisor = (nth_total == 1 || disable_chunking) ? (size_t)nth_total : (size_t)nth_total * chunk_multiplier;
+        size_t chunk_cols = align_up(std::max<size_t>(1, ceil_div_size(n, chunk_divisor)), common_step);
+        if (chunk_cols == 0) {
+            chunk_cols = common_step;
        }
+        // If common_step is larger than n, the loop below runs one valid tail chunk
+        // with cols == n.
+        const size_t nchunk_size = std::max<size_t>(1, ceil_div_size(n, chunk_cols));
+        GGML_ASSERT(nchunk_size <= (size_t)INT_MAX);
+        const int nchunk = (int)nchunk_size;
        const size_t dst_stride = dst->nb[1];

+        auto run_chunk = [&](runtime_slot & slot, size_t global_start, size_t cols, uint8_t * dst_batch_base) {
+            const size_t rhs_packed_offset = slot.kernel->get_rhs_packed_offset_ex(global_start, k, slot.rhs_bl);
+            const size_t dst_offset        = slot.kernel->get_dst_offset(0, global_start, dst_stride);
+
+            const uint8_t * lhs_ptr = scratch + slot.lhs_offset + slot.lhs_packed_offset0;
+            const uint8_t * rhs_ptr = slot.rhs_base + rhs_packed_offset;
+            float * dst_ptr         = reinterpret_cast<float *>(dst_batch_base + dst_offset);
+
+            slot.kernel->run_kernel_ex(m, cols, k, slot.rhs_bl,
+                                       lhs_ptr,
+                                       rhs_ptr,
+                                       dst_ptr,
+                                       dst_stride,
+                                       sizeof(float),
+                                       -FLT_MAX,
+                                       FLT_MAX);
+        };
+
        for (int64_t batch_idx = 0; batch_idx < ne12; ++batch_idx) {
            const uint8_t * lhs_batch_base = static_cast<const uint8_t *>(src1->data) + batch_idx * src1->nb[2];
            uint8_t * dst_batch_base = static_cast<uint8_t *>(dst->data) + batch_idx * dst->nb[2];

            if (runtime[local_slot].assigned_threads > 0) {
                runtime_slot & slot = runtime[local_slot];
-                const ggml_type slot_rhs_type = slot.kernels->rhs_type;
-                const size_t slot_lhs_exec_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 :
-                                                 slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0;
                const int64_t m_roundup_mr = kai_roundup((int64_t)m, (int64_t)slot.mr);
                int64_t max_threads = slot.mr ? (m_roundup_mr / (int64_t)slot.mr) : slot.assigned_threads;
                max_threads = std::max<int64_t>(1, max_threads);
@@ -1031,8 +1061,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                    const int64_t m_start = (int64_t)local_ith * num_m_per_thread0;
                    const int64_t m_count = (local_ith == use_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;

-                    const size_t base_packed_off  = slot.lhs_info->get_packed_offset_ex(m_start, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr);
-                    const size_t next_block_off   = slot.lhs_info->get_packed_offset_ex(m_start + slot.mr, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr);
+                    const size_t base_packed_off  = slot.lhs_info->get_packed_offset_ex(m_start, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr);
+                    const size_t next_block_off   = slot.lhs_info->get_packed_offset_ex(m_start + slot.mr, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr);
                    const size_t row_stride_bytes = slot.mr ? (next_block_off - base_packed_off) / slot.mr : 0;

                    int64_t remaining = m_count;
@@ -1049,7 +1079,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                        const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes;
                        void * dst_ptr       = lhs_packed + dst_off;

-                        slot.lhs_info->pack_func_ex(take, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr, 0, src_ptr, src1->nb[1], dst_ptr);
+                        slot.lhs_info->pack_func_ex(take, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr, 0, src_ptr, src1->nb[1], dst_ptr);

                        cur       += take;
                        remaining -= take;
@@ -1057,49 +1087,29 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                }
            }

+            if (ith_total == 0) {
+                ggml_threadpool_chunk_set(params->threadpool, nth_total);
+            }
+
+            // Publishes both LHS packing and the initialized dynamic chunk queue.
            ggml_barrier(params->threadpool);

            runtime_slot & slot = runtime[local_slot];
-            if (slot.n_cols > 0 && slot.assigned_threads > 0) {
-                int64_t active_threads = slot.assigned_threads;
-                const int64_t max_threads = slot.n_step ? (slot.n_cols / slot.n_step) : slot.assigned_threads;
-                if (max_threads > 0) {
-                    active_threads = std::min<int64_t>(active_threads, std::max<int64_t>(1, max_threads));
+            int current_chunk = ith_total;
+            while (current_chunk < nchunk) {
+                const size_t global_start = (size_t)current_chunk * chunk_cols;
+                if (global_start >= n) {
+                    break;
                }
-                active_threads = std::max<int64_t>(1, active_threads);

-                if (local_ith < active_threads) {
-                    const size_t step = slot.n_step ? slot.n_step : 1;
-                    const size_t chunk0 = round_down((size_t)(slot.n_cols / active_threads), step);
-                    const size_t chunkN = slot.n_cols - (active_threads - 1) * chunk0;
-                    const size_t local_start = (size_t)local_ith * chunk0;
-                    const size_t cols = (local_ith == active_threads - 1) ? chunkN : chunk0;
-
-                    if (cols > 0) {
-                        const ggml_type slot_rhs_type = slot.kernels->rhs_type;
-                        const size_t slot_lhs_exec_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 :
-                                                         slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0;
-                        const size_t slot_rhs_block_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 :
-                                                          slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0;
-                        const size_t global_start = slot.n_offset + local_start;
-                        const size_t lhs_packed_offset = slot.lhs_info->get_packed_offset_ex(0, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr);
-                        const size_t rhs_packed_offset = slot.kernel->get_rhs_packed_offset_ex(global_start, k, slot_rhs_block_arg);
-                        const size_t dst_offset        = slot.kernel->get_dst_offset(0, global_start, dst_stride);
-
-                        const uint8_t * lhs_ptr = scratch + slot.lhs_offset + lhs_packed_offset;
-                        const uint8_t * rhs_ptr = slot.rhs_base + rhs_packed_offset;
-                        float * dst_ptr         = reinterpret_cast<float *>(dst_batch_base + dst_offset);
-
-                        slot.kernel->run_kernel_ex(m, cols, k, slot_rhs_block_arg,
-                                                   lhs_ptr,
-                                                   rhs_ptr,
-                                                   dst_ptr,
-                                                   dst_stride,
-                                                   sizeof(float),
-                                                   -FLT_MAX,
-                                                   FLT_MAX);
-                    }
+                const size_t cols = std::min(chunk_cols, n - global_start);
+                if (cols > 0) {
+                    // KleidiAI GEMM/GEMV kernels accept arbitrary final tail widths;
+                    // only non-tail chunks are guaranteed to be n_step-aligned.
+                    run_chunk(slot, global_start, cols, dst_batch_base);
                }
+
+                current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
            }

            if (batch_idx != ne12 - 1) {
@@ -4008,12 +4008,12 @@ static void ggml_compute_forward_rms_norm_back_f32(
                // dx := scale(dx, rrms)
                float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);

-                // dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
-                ggml_vec_cpy_f32  (ne00, dx, x);
-                // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
-                ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
-                ggml_vec_acc_f32  (ne00, dx, dz);
-                ggml_vec_scale_f32(ne00, dx, rrms);
+                // dx[i00] = (dz + x*(-sum_xdz/sum_eps)) * rrms
+                // note: https://github.com/ggml-org/ggml/issues/1491
+                const float scale_x = (float) (-sum_xdz) / sum_eps;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    dx[i00] = (dz[i00] + x[i00] * scale_x) * rrms;
+                }
            }
        }
    }
@@ -6730,6 +6730,78 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
    return (coord  + size) % size; // adding size avoids negative number weirdness
 }

+// ggml_compute_forward_col2im_1d
+//
+// Scatter-add columns [K*OC, T_in] -> signal [T_out, OC]
+// where T_out = (T_in - 1)*s + K - 2*p.  Gather approach: each output reads ceil(K/s) inputs.
+// Parallelized over the time axis so the split stays balanced whatever OC is.
+// Supports F32, F16, BF16 input/output (same type), F32 accumulator.
+
+template <typename elem_t>
+static void ggml_compute_forward_col2im_1d_impl(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src = dst->src[0];  // [K*OC, T_in]
+
+    GGML_ASSERT(ggml_is_contiguous(src));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t OC = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+
+    const int64_t K_OC = src->ne[0];
+    const int64_t T_in = src->ne[1];
+    const int64_t K    = K_OC / OC;
+    const int64_t T_out = dst->ne[0];
+
+    const elem_t * col_data = (const elem_t *) src->data;
+    elem_t       * dst_data = (elem_t *) dst->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // Parallelize over the time axis: the split stays balanced whatever OC is,
+    // down to OC = 1 for mono audio, and threads read disjoint column bands
+    const int64_t dr = (T_out + nth - 1) / nth;
+    const int64_t it0 = dr * ith;
+    const int64_t it1 = it0 + dr < T_out ? it0 + dr : T_out;
+
+    for (int64_t oc = 0; oc < OC; oc++) {
+        for (int64_t t_out = it0; t_out < it1; t_out++) {
+            const int64_t t_abs = t_out + p0;  // absolute position in uncropped signal
+            // Gather: find all (t_in, k) where t_in * s + k == t_abs, 0 <= k < K
+            int64_t t_in_min = (t_abs - K + 1 + s0 - 1) / s0;  // ceil((t_abs-K+1)/s)
+            if (t_in_min < 0) t_in_min = 0;
+            int64_t t_in_max = t_abs / s0;
+            if (t_in_max >= T_in) t_in_max = T_in - 1;
+
+            float sum = 0.0f;
+            for (int64_t t_in = t_in_min; t_in <= t_in_max; t_in++) {
+                int64_t k = t_abs - t_in * s0;
+                if (k >= 0 && k < K) {
+                    // col layout: [K*OC, T_in], element (oc*K+k, t_in)
+                    sum += type_conversion_table<elem_t>::to_f32(col_data[(oc * K + k) + t_in * K_OC]);
+                }
+            }
+            // dst layout: [T_out, OC], element (t_out, oc)
+            dst_data[t_out + oc * T_out] = type_conversion_table<elem_t>::from_f32(sum);
+        }
+    }
+}
+
+void ggml_compute_forward_col2im_1d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    switch (dst->src[0]->type) {
+        case GGML_TYPE_F32:  ggml_compute_forward_col2im_1d_impl<float>      (params, dst); break;
+        case GGML_TYPE_F16:  ggml_compute_forward_col2im_1d_impl<ggml_fp16_t>(params, dst); break;
+        case GGML_TYPE_BF16: ggml_compute_forward_col2im_1d_impl<ggml_bf16_t>(params, dst); break;
+        default: GGML_ABORT("col2im_1d: unsupported type %d", dst->src[0]->type);
+    }
+}
+
 // ggml_compute_forward_conv_2d


@@ -10552,11 +10624,11 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(

    const bool kda = (neg0 == S_v);

-    // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
-    const int64_t K = src_state->ne[1];
+    // K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
+    const int64_t K = ggml_get_op_params_i32(dst, 0);
    GGML_ASSERT(K >= 1);
-    // per-seq stride in floats (slot 0 of seq s lives at state + s * seq_stride)
-    const int64_t state_seq_stride = src_state->nb[2] / sizeof(float);
+    // per-seq stride in floats (seq s starts at state + s * seq_stride)
+    const int64_t state_seq_stride = src_state->nb[3] / sizeof(float);

    const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
    const int ith = params->ith;
@@ -10572,9 +10644,8 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
    float * attn_out_base  = (float *)dst->data;
    float * state_out_base = (float *)dst->data + attn_score_elems;

-    // snapshot slot mapping: target_slot = t - shift. When n_tokens < K only the last
-    // n_tokens slots are written; earlier slots are left untouched (caller-owned).
-    const int64_t shift = n_tokens - K;
+    // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
+    // When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.

    const float * state_in_base = (const float *)src_state->data;

@@ -10602,7 +10673,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
            : state_out_base + (iv3 * H + iv1) * S_v * S_v;

        // copy input state into the working buffer and operate in-place
-        // state layout (D, K, n_seqs): slot 0 of seq iv3 starts at iv3 * state_seq_stride.
+        // state layout [S_v, S_v, H, n_seqs]: seq iv3 starts at iv3 * state_seq_stride.
        const float * s_in = state_in_base + iv3 * state_seq_stride + iv1 * S_v * S_v;
        memcpy(s_out, s_in, S_v * S_v * sizeof(float));

@@ -10655,7 +10726,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
            attn_data += S_v * H; // advance to next token

            if (K > 1) {
-                const int64_t target_slot = t - shift;
+                const int64_t target_slot = n_tokens - 1 - t;
                if (target_slot >= 0 && target_slot < K) {
                    float * curr_state_o = state_out_base + target_slot * state_size_per_snap +
                                     (iv3 * H + iv1) * S_v * S_v;
@@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -1,16 +1,18 @@
 #include "concat.cuh"

+#include <stdint.h>
+
 // contiguous kernels
-template <int dim>
-static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
-                                                                                 const float * y,
-                                                                                 float *       dst,
-                                                                                 int64_t       ne00,
-                                                                                 int64_t       ne01,
-                                                                                 int64_t       ne02,
-                                                                                 int64_t       ne0,
-                                                                                 int64_t       ne1,
-                                                                                 int64_t       ne2) {
+template <typename T, int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_cont(const T * x,
+                                                                             const T * y,
+                                                                             T *       dst,
+                                                                             int64_t   ne00,
+                                                                             int64_t   ne01,
+                                                                             int64_t   ne02,
+                                                                             int64_t   ne0,
+                                                                             int64_t   ne1,
+                                                                             int64_t   ne2) {
    static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");

    const int64_t n = ne0 * ne1 * ne2;
@@ -50,37 +52,37 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
    }
 }

-static void concat_f32_cuda(const float * x,
-                            const float * y,
-                            float *       dst,
-                            int64_t       ne00,
-                            int64_t       ne01,
-                            int64_t       ne02,
-                            int64_t       ne0,
-                            int64_t       ne1,
-                            int64_t       ne2,
-                            int           dim,
-                            cudaStream_t  stream) {
+template <typename T>
+static void concat_cont_cuda(const T * x,
+                             const T * y,
+                             T *       dst,
+                             int64_t   ne00,
+                             int64_t   ne01,
+                             int64_t   ne02,
+                             int64_t   ne0,
+                             int64_t   ne1,
+                             int64_t   ne2,
+                             int       dim,
+                             cudaStream_t stream) {
    const int64_t n          = ne0 * ne1 * ne2;
    const int     num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;

    if (dim == 0) {
        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
-        ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+        ggml_cuda_kernel_launch(concat_cont<T, 0>, launch_params, x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
        return;
    }
    if (dim == 1) {
-        concat_f32_cont<1>
-            <<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+        concat_cont<T, 1><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
        return;
    }
-    concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+    concat_cont<T, 2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
 }

 // non-contiguous kernel (slow)
-template <int dim>
+template <typename T, int dim>
 static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
-    concat_f32_non_cont(
+    concat_non_cont(
        const char * src0,
        const char * src1,
              char * dst,
@@ -107,61 +109,49 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
          uint64_t   nb0,
          uint64_t   nb1,
          uint64_t   nb2,
-          uint64_t   nb3){
+          uint64_t   nb3) {
    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;

-    const float * x;
+    const T * x;

    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-            x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+            x = (const T *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
        } else {
            if constexpr (dim == 0) {
-                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+                x = (const T *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10);
            } else if constexpr (dim == 1) {
-                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+                x = (const T *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10);
            } else if constexpr (dim == 2) {
-                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+                x = (const T *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10);
            } else if constexpr (dim == 3) {
-                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+                x = (const T *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
            }
        }

-        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+        T * y = (T *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

        *y = *x;
    }
 }

-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    cudaStream_t stream = ctx.stream();
-
-    const int32_t dim = ((int32_t *) dst->op_params)[0];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-
+template <typename T>
+static void concat_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, int dim, cudaStream_t stream) {
    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        const float * src0_d = (const float *)src0->data;
-        const float * src1_d = (const float *)src1->data;
-
-        float * dst_d = (float *)dst->data;
+        const T * src0_d = (const T *) src0->data;
+        const T * src1_d = (const T *) src1->data;
+        T *       dst_d  = (T *) dst->data;

        if (dim != 3) {
-            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-                concat_f32_cuda(
-                        src0_d + i3 * (src0->nb[3] / 4),
-                        src1_d + i3 * (src1->nb[3] / 4),
-                        dst_d + i3 * ( dst->nb[3] / 4),
+            for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_cont_cuda(
+                        src0_d + i3*(src0->nb[3] / sizeof(T)),
+                        src1_d + i3*(src1->nb[3] / sizeof(T)),
+                        dst_d  + i3*( dst->nb[3] / sizeof(T)),
                        src0->ne[0], src0->ne[1], src0->ne[2],
                        dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
            }
@@ -169,13 +159,13 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
            const size_t size0 = ggml_nbytes(src0);
            const size_t size1 = ggml_nbytes(src1);

-            CUDA_CHECK(cudaMemcpyAsync(dst_d,           src0_d, size0, cudaMemcpyDeviceToDevice, stream));
-            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync((char *) dst->data,         src0->data, size0, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync((char *) dst->data + size0, src1->data, size1, cudaMemcpyDeviceToDevice, stream));
        }
    } else {
        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
        auto launch_kernel = [&](auto dim) {
-            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+            concat_non_cont<T, dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
@@ -203,3 +193,35 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        }
    }
 }
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    cudaStream_t stream = ctx.stream();
+
+    const int32_t dim = ((int32_t *) dst->op_params)[0];
+
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT(dst->type  == src0->type);
+    GGML_ASSERT(!ggml_is_quantized(src0->type));
+    GGML_ASSERT(ggml_blck_size(src0->type) == 1);
+
+    switch (ggml_type_size(src0->type)) {
+        case 1:
+            concat_cuda<uint8_t>(src0, src1, dst, dim, stream);
+            break;
+        case 2:
+            concat_cuda<uint16_t>(src0, src1, dst, dim, stream);
+            break;
+        case 4:
+            concat_cuda<uint32_t>(src0, src1, dst, dim, stream);
+            break;
+        case 8:
+            concat_cuda<uint64_t>(src0, src1, dst, dim, stream);
+            break;
+        default:
+            GGML_ABORT("Unsupported type size: %zu", ggml_type_size(src0->type));
+            break;
+    }
+}
@@ -39,9 +39,9 @@ gated_delta_net_cuda(const float * q,
    float *       attn_data        = dst;
    float *       state            = dst + attn_score_elems;

-    // input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
+    // input state holds s0 only: [S_v, S_v, H, n_seqs] — seq stride is D = H * S_v * S_v.
    // output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
-    const int64_t state_in_offset      = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
+    const int64_t state_in_offset      = sequence * H * S_v * S_v + h_idx * S_v * S_v;
    const int64_t state_out_offset     = (sequence * H + h_idx) * S_v * S_v;
    state += state_out_offset;
    curr_state += state_in_offset + col * S_v;
@@ -143,12 +143,10 @@ gated_delta_net_cuda(const float * q,
        attn_data += S_v * H;

        if constexpr (keep_rs_t) {
-            // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
-            // are written; earlier slots are left untouched (caller-owned).
-            const int shift = (int) n_tokens - K;
-
+            // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
+            // When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.
            const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
-            const int target_slot = t - shift;
+            const int target_slot = (int) n_tokens - 1 - t;
            if (target_slot >= 0 && target_slot < K) {
                float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
 #pragma unroll
@@ -286,8 +284,8 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *

    cudaStream_t stream = ctx.stream();

-    // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
-    const int K = (int) src_state->ne[1];
+    // K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
+    const int K = ggml_get_op_params_i32(dst, 0);
    const bool keep_rs = K > 1;

    if (kda) {
@@ -622,6 +622,18 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {

 // cuda buffer

+struct ggml_backend_cuda_device_context {
+    int device;
+    std::string name;
+    std::string description;
+    std::string pci_bus_id;
+    int op_offload_min_batch_size;
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    std::mutex device_mutex;
+    int active_count = 0;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+};
+
 struct ggml_backend_cuda_buffer_context {
    int device;
    void * dev_ptr = nullptr;
@@ -639,6 +651,13 @@ struct ggml_backend_cuda_buffer_context {

 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
+    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
+    dev_ctx->active_count--;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    delete ctx;
 }

@@ -791,6 +810,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac

    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
+    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
+    dev_ctx->active_count++;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }

@@ -1490,6 +1515,12 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
 }

 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
+    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
+    dev_ctx->active_count--;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    CUDA_CHECK(cudaFreeHost(buffer->context));
 }

@@ -1498,6 +1529,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
        return nullptr;
    }

+    ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0.
+
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
@@ -1523,6 +1556,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
    buffer->buft = buft;
    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
+    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
+    dev_ctx->active_count++;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    return buffer;
 }

@@ -3140,6 +3179,12 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
 static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context;
+    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
+    dev_ctx->active_count--;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    delete cuda_ctx;
    delete backend;
 }
@@ -4871,14 +4916,6 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {

 // backend device

-struct ggml_backend_cuda_device_context {
-    int device;
-    std::string name;
-    std::string description;
-    std::string pci_bus_id;
-    int op_offload_min_batch_size;
-};
-
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    return ctx->name.c_str();
@@ -4967,6 +5004,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k

 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    std::lock_guard<std::mutex> lock(ctx->device_mutex);
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemGetInfo(free, total));

@@ -4993,6 +5035,13 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
    }
 #endif // defined(__linux__)

+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    // If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
+    // context that permanently consumes VRAM. Reset the device to free it.
+    if (ctx->active_count == 0) {
+        CUDA_CHECK(cudaDeviceReset());
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }

 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
@@ -5288,15 +5337,24 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            } break;
        case GGML_OP_REPEAT:
            {
+                // the CUDA REPEAT path only implements F32/F16; other types assert at runtime
                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+                return src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16;
            } break;
        case GGML_OP_REPEAT_BACK:
                return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
        case GGML_OP_CONCAT:
            {
                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+                ggml_type src1_type = op->src[1]->type;
+                return src0_type == src1_type &&
+                       src0_type == op->type &&
+                       !ggml_is_quantized(src0_type) &&
+                       ggml_blck_size(src0_type) == 1 &&
+                       (ggml_type_size(src0_type) == 1 ||
+                        ggml_type_size(src0_type) == 2 ||
+                        ggml_type_size(src0_type) == 4 ||
+                        ggml_type_size(src0_type) == 8);
            } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
@@ -5687,13 +5745,21 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
        return nullptr;
    }

+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);
+
    ggml_backend_t cuda_backend = new ggml_backend {
        /* .guid    = */ ggml_backend_cuda_guid(),
        /* .iface   = */ ggml_backend_cuda_interface,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
+        /* .device  = */ dev,
        /* .context = */ ctx,
    };

+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
+    dev_ctx->active_count++;
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
    return cuda_backend;
 }

@@ -411,7 +411,6 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
                case GGML_TYPE_Q5_0:
                case GGML_TYPE_Q5_1:
                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q4_K:
                    return 8;
                case GGML_TYPE_Q6_K:
                    return 2;
@@ -682,12 +681,16 @@ static __global__ void mul_mat_vec_q(
 template <ggml_type type, int c_rows_per_block>
 __launch_bounds__(get_mmvq_mmid_max_batch_for_device<type>()*ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void mul_mat_vec_q_moe(
-        const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids,
-        float * __restrict__ dst,
+        const void * vx_ptr, const void * vy_ptr, const int32_t * ids_ptr,
+        float * dst_ptr,
        const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
        const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
        const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
        const uint32_t ncols_dst, const uint32_t ids_stride) {
+    const void    * GGML_CUDA_RESTRICT vx  = vx_ptr;
+    const void    * GGML_CUDA_RESTRICT vy  = vy_ptr;
+    const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr;
+    float         * GGML_CUDA_RESTRICT dst = dst_ptr;

    constexpr int qk  = ggml_cuda_type_traits<type>::qk;
    constexpr int qi  = ggml_cuda_type_traits<type>::qi;
@@ -707,6 +710,7 @@ static __global__ void mul_mat_vec_q_moe(
        return;
    }

+    ggml_cuda_pdl_sync();
    const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride];
    const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y);

@@ -726,6 +730,8 @@ static __global__ void mul_mat_vec_q_moe(
        }
    }

+    ggml_cuda_pdl_lc();
+
    // Warp-level reduction only - no shared memory needed
 #pragma unroll
    for (int i = 0; i < c_rows_per_block; ++i) {
@@ -794,8 +800,9 @@ static void mul_mat_vec_q_moe_launch(
    const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block;
    const dim3 block_nums(nblocks_rows, nchannels_dst);
    const dim3 block_dims(warp_size, ncols_dst);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);

-    mul_mat_vec_q_moe<type, rows_per_block><<<block_nums, block_dims, 0, stream>>>(
+    ggml_cuda_kernel_launch(mul_mat_vec_q_moe<type, rows_per_block>, launch_params,
        vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x,
        stride_row_x, stride_col_y, stride_col_dst,
        stride_channel_x, stride_channel_y, stride_channel_dst,
@@ -67,6 +67,7 @@ __global__ void __launch_bounds__(splitD, 1)
    __shared__ CubTempStorage cub_temp_storage;

    BlockLoad(cub_temp_storage.load_temp).Load(A_block, regA);
+    __syncthreads();
    BlockLoad(cub_temp_storage.load_temp).Load(s0_block, regs0);
 #else
    const int stride_s0 = src0_nb2 / sizeof(float);
@@ -105,6 +106,7 @@ __global__ void __launch_bounds__(splitD, 1)
            regs0[n] = state;
        }
        y_block[i * stride_y + threadIdx.x] = sumf;
+        __syncthreads();
    }

 #ifdef USE_CUB
@@ -249,9 +251,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
        GGML_ASSERT(head_dim == 1);
        GGML_ASSERT(n_group == 1);
        const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
-        const int  smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
        if (d_state == 16) {
-            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream);
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
            switch (n_tok)
            {
            case 1:
@@ -219,9 +219,9 @@
 #define RDNA3
 #endif // defined(__GFX11__)

-#if defined(__gfx1150__) || defined(__gfx1151__)
+#if defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1152__) || defined(__gfx1153__)
 #define RDNA3_5
-#endif // defined(__gfx1150__) || defined(__gfx1151__)
+#endif // defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1152__) || defined(__gfx1153__)

 #if defined(RDNA3) && !defined(RDNA3_5)
 #define RDNA3_0
@@ -2538,7 +2538,7 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
    const int64_t H        = v->ne[1];
    const int64_t n_tokens = v->ne[2];
    const int64_t n_seqs   = v->ne[3];
-    const int64_t K        = state->ne[1];
+    const int64_t K        = ggml_get_op_params_i32(op, 0);

    if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
        return false;
@@ -2551,7 +2551,8 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
        return false;
    }
-    if (ggml_nelements(state) != S_v * S_v * H * n_seqs * K) {
+    // state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
+    if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
        return false;
    }
    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
@@ -584,7 +584,7 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
    const uint32_t H        = v->ne[1];
    const uint32_t n_tokens = v->ne[2];
    const uint32_t n_seqs   = v->ne[3];
-    const uint32_t K        = state->ne[1];
+    const uint32_t K        = octx->op_params[0];

    const uint32_t total_rows = H * n_seqs;
    if (ith >= total_rows) {
@@ -618,9 +618,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
    struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
    struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);

-    const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
+    const uint64_t state_seq_stride = state->nb[3] / sizeof(float);
    const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
-    const int64_t shift = (int64_t) n_tokens - (int64_t) K;

    uint32_t ir_prefetch = ith;
    int spad_idx = 0;
@@ -630,7 +629,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
        const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
        const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
        const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
-        float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
+        // final state lands in snapshot slot 0 (most-recent-first ordering)
+        float * ps_out = state_out_base + ((uint64_t) piv3 * H + piv1) * S_v * S_v;

        // Push dummy write-back
        dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
@@ -661,7 +661,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
        const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
        const uint32_t ik3 = fastdiv(iv3, &fd_rk3);

-        float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        // final state lands in snapshot slot 0 (most-recent-first ordering)
+        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;

        float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;

@@ -792,7 +793,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
            }

            if (K > 1) {
-                const int64_t target_slot = (int64_t) t - shift;
+                // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
+                const int64_t target_slot = (int64_t) n_tokens - 1 - (int64_t) t;
                if (target_slot >= 0 && target_slot < (int64_t) K) {
                    float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
                    if (curr_state_o != s_out) {
@@ -844,7 +846,6 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
    const uint32_t S_v      = v->ne[0];
    const uint32_t H        = v->ne[1];
    const uint32_t n_seqs   = v->ne[3];
-    const uint32_t K        = state->ne[1];

    const uint32_t total_rows = H * n_seqs;
    if (ith >= total_rows) {
@@ -878,8 +879,7 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
    struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
    struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);

-    const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
-    const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
+    const uint64_t state_seq_stride = state->nb[3] / sizeof(float);

    uint32_t ir_prefetch = ith;
    int spad_idx = 0;
@@ -889,7 +889,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
        const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
        const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
        const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
-        float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
+        // final state lands in snapshot slot 0 (most-recent-first ordering)
+        float * ps_out = state_out_base + ((uint64_t) piv3 * H + piv1) * S_v * S_v;

        // Push dummy write-back
        dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
@@ -920,7 +921,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
        const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
        const uint32_t ik3 = fastdiv(iv3, &fd_rk3);

-        float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        // final state lands in snapshot slot 0 (most-recent-first ordering)
+        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;

        float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;

@@ -1097,7 +1099,7 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
    const uint32_t H        = v->ne[1];
    const uint32_t n_tokens = v->ne[2];
    const uint32_t n_seqs   = v->ne[3];
-    const uint32_t K        = state->ne[1];
+    const uint32_t K        = octx->op_params[0];

    if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
        return HTP_STATUS_NO_SUPPORT;
@@ -1110,7 +1112,8 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
        return HTP_STATUS_NO_SUPPORT;
    }
-    if (state->ne[0] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
+    // state holds s0 only: [S_v, S_v, H, n_seqs]
+    if (state->ne[0] != S_v || state->ne[1] != S_v || state->ne[2] != H || state->ne[3] != n_seqs) {
        return HTP_STATUS_NO_SUPPORT;
    }
    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
@@ -590,8 +590,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net(
    const int ne20 = op->src[2]->ne[0]; // S_v
    const int ne21 = op->src[2]->ne[1]; // H
    const int ne30 = op->src[3]->ne[0]; // G
-    // state is src[5], 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
-    const int K = op->src[5]->ne[1];
+    // state is src[5], 4D [S_v, S_v, H_v, n_seqs] (s0 only); K is op param 0.
+    const int K = ggml_get_op_params_i32(op, 0);

    const int nsg = op->src[2]->ne[0]/32;

@@ -1738,10 +1738,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_meta
    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
    GGML_ASSERT(op->type         == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);

+    const bool is_2D = ((const int32_t *)(op->op_params))[6] == 1;
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
    char base[256];
    char name[256];

-    if (ne00*ne01 <= 1024) {
+    if (KH*KW <= 1024) {
        snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
    } else {
        snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
@@ -1120,8 +1120,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
-        case GGML_OP_CONCAT:
            return true;
+        case GGML_OP_CONCAT:
+            {
+                // kernel_concat copies one float-sized value per element.
+                // Other scalar types need a type-generic copy kernel first.
+                const enum ggml_type src0_type = op->src[0]->type;
+                const enum ggml_type src1_type = op->src[1]->type;
+                return src0_type == src1_type &&
+                       src0_type == op->type &&
+                       (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
+            }
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
@@ -1418,6 +1418,9 @@ typedef decltype(kernel_repeat<float>) kernel_repeat_t;

 template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
 template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
+#if defined(GGML_METAL_HAS_BF16)
+template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat<bfloat>;
+#endif
 template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
 template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;

@@ -2599,9 +2602,9 @@ kernel void kernel_gated_delta_net_impl(

    const float scale = 1.0f / sqrt((float)S_v);

-    // input state layout (D, K, n_seqs): per-seq stride is K*H*D; we read slot 0.
+    // input state layout [S_v, S_v, H, n_seqs] (s0 only): per-seq stride is H*D.
    // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
-    const uint state_in_base = (i23*K*args.ne21 + i21)*S_v*S_v + i20*S_v;
+    const uint state_in_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
    device const float * s_ptr = (device const float *) (s) + state_in_base;

    float ls[NSG];
@@ -2620,9 +2623,8 @@ kernel void kernel_gated_delta_net_impl(
    device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
    device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;

-    // snapshot slot mapping: target_slot = t - shift. When n_tokens < K, only the last
-    // n_tokens slots are written; earlier slots are left untouched (caller-owned).
-    const int shift = (int)args.ne22 - (int)K;
+    // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
+    // When n_tokens < K, only slots 0..n_tokens-1 are written; older slots are caller-owned.

    // output state base offset: after attention scores
    const uint attn_size = args.ne22 * args.ne21 * S_v * args.ne23;
@@ -2680,7 +2682,7 @@ kernel void kernel_gated_delta_net_impl(
        g_ptr += args.ne21*G;

        if (K > 1) {
-            const int target_slot = (int)t - shift;
+            const int target_slot = (int)args.ne22 - 1 - (int)t;
            if (target_slot >= 0 && target_slot < (int)K) {
                device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base;
                FOR_UNROLL (short j = 0; j < NSG; j++) {
@@ -142,6 +142,10 @@ set(GGML_OPENCL_KERNELS
    gemm_noshuffle_q4_0_f32
    gemv_noshuffle_q4_1_f32
    gemm_noshuffle_q4_1_f32
+    gemv_noshuffle_q5_0_f32
+    gemm_noshuffle_q5_0_f32
+    gemv_noshuffle_q5_1_f32
+    gemm_noshuffle_q5_1_f32
    gemv_noshuffle_iq4_nl_f32
    gemm_noshuffle_iq4_nl_f32
    gemv_noshuffle_q8_0_f32
@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
-    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
+    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
    cl_kernel kernel_mul_mat_f32_f32;
    cl_kernel kernel_mul_mat_f16_f16;
    cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -593,6 +593,10 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_restore_block_q4_0_noshuffle;
    cl_kernel kernel_convert_block_q4_1_noshuffle;
    cl_kernel kernel_restore_block_q4_1_noshuffle;
+    cl_kernel kernel_convert_block_q5_0_noshuffle;
+    cl_kernel kernel_restore_block_q5_0_noshuffle;
+    cl_kernel kernel_convert_block_q5_1_noshuffle;
+    cl_kernel kernel_restore_block_q5_1_noshuffle;
    cl_kernel kernel_convert_block_q4_K_noshuffle;
    cl_kernel kernel_restore_block_q4_K_noshuffle;
    cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
@@ -639,7 +643,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32;
+    cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
    cl_kernel kernel_conv_2d_f16;
    cl_kernel kernel_conv_2d_f32;
    cl_kernel kernel_conv_2d_f16_f32;
@@ -829,6 +833,10 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_gemm_noshuffle_q6_K_f32;
    cl_kernel kernel_gemv_noshuffle_q5_k_f32;
    cl_kernel kernel_gemm_noshuffle_q5_k_f32;
+    cl_kernel kernel_gemv_noshuffle_q5_0_f32;
+    cl_kernel kernel_gemm_noshuffle_q5_0_f32;
+    cl_kernel kernel_gemv_noshuffle_q5_1_f32;
+    cl_kernel kernel_gemm_noshuffle_q5_1_f32;
    cl_kernel kernel_gemv_noshuffle_iq4_nl_f32;
    cl_kernel kernel_gemm_noshuffle_iq4_nl_f32;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
@@ -1121,6 +1129,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
        GGML_LOG_CONT(".");
    }
@@ -1151,6 +1160,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q5_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q5_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q5_1  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1", &err), err));
@@ -2615,6 +2628,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
        CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
    }
@@ -3063,6 +3077,80 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        GGML_LOG_CONT(".");
    }

+    // gemm_noshuffle_q5_0_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_noshuffle_q5_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_noshuffle_q5_0_f32.cl");
+#endif
+        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_0_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_q5_0_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable ";
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
+        }
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_noshuffle_q5_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_noshuffle_q5_0_f32.cl");
+#endif
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_0_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemm_noshuffle_q5_1_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_noshuffle_q5_1_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_noshuffle_q5_1_f32.cl");
+#endif
+        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_1_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_q5_1_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable ";
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
+        }
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_noshuffle_q5_1_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_noshuffle_q5_1_f32.cl");
+#endif
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_1_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
    // gemm_noshuffle_iq4_nl_f32
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -6105,15 +6193,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            return;
        }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
-            cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
-            cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0_noshuffle;
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
-            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));

-            size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
            size_t local_work_size[] = {64, 1, 1};

            cl_event evt;
@@ -6122,7 +6211,39 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            CL_CHECK(clReleaseMemObject(data_device));

            tensor->extra = extra;
+
+            int M = tensor->ne[1];
+            int K = tensor->ne[0];
+            GGML_ASSERT(K % 32 == 0);
+
+            // Transpose qs as ushort
+            transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M);
+            // Transpose qh as uchar
+            transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
+            // Transpose d as ushort
+            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
+
            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
+        cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));
+
+        size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+        return;
    }
    if (tensor->type == GGML_TYPE_Q5_1) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
@@ -6223,6 +6344,42 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            return;
        }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1_noshuffle;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m));
+
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clReleaseMemObject(data_device));
+
+            tensor->extra = extra;
+
+            int M = tensor->ne[1];
+            int K = tensor->ne[0];
+            GGML_ASSERT(K % 32 == 0);
+
+            // Transpose qs as ushort
+            transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M);
+            // Transpose qh as uchar
+            transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
+            // Transpose d as ushort
+            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
+            // Transpose m as ushort
+            transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M);
+
+            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1;
        cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
@@ -7297,6 +7454,48 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            ggml_cl_buffer buf_trans_qs;
+            ggml_cl_buffer buf_trans_qh;
+            ggml_cl_buffer buf_trans_d;
+            ggml_cl_buffer buf_unpacked;
+
+            cl_int M = tensor->ne[1];
+            cl_int K = tensor->ne[0];
+
+            GGML_ASSERT(K % 32 == 0);
+
+            size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+            size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t);
+            size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+
+            buf_trans_qs.allocate(backend_ctx->context, size_qs);
+            buf_trans_qh.allocate(backend_ctx->context, size_qh);
+            buf_trans_d.allocate(backend_ctx->context, size_d);
+            buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
+
+            transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4);
+            transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
+            transpose_2d_as_16b(backend_ctx, extra->d,  buf_trans_d.buffer,  size_d,  M, K/32);
+
+            cl_uchar mask_0F = 0x0F;
+            cl_uchar mask_F0 = 0xF0;
+
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {1, 1, 1};
+
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0_noshuffle;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &buf_trans_qs.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &buf_trans_qh.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &buf_trans_d.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &buf_unpacked.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_F0));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+            CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
+            return;
+        }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS

        cl_int err;
@@ -7360,6 +7559,54 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
+
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            ggml_cl_buffer buf_trans_qs;
+            ggml_cl_buffer buf_trans_qh;
+            ggml_cl_buffer buf_trans_d;
+            ggml_cl_buffer buf_trans_m;
+            ggml_cl_buffer buf_unpacked;
+
+            cl_int M = tensor->ne[1];
+            cl_int K = tensor->ne[0];
+            GGML_ASSERT(K % 32 == 0);
+
+            size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+            size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t);
+            size_t size_d  = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+            size_t size_m  = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+
+            buf_trans_qs.allocate(backend_ctx->context, size_qs);
+            buf_trans_qh.allocate(backend_ctx->context, size_qh);
+            buf_trans_d.allocate(backend_ctx->context, size_d);
+            buf_trans_m.allocate(backend_ctx->context, size_m);
+            buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
+
+            // Transpose back: from col-major to row-major
+            transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4);
+            transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
+            transpose_2d_as_16b(backend_ctx, extra->d,  buf_trans_d.buffer,  size_d,  M, K/32);
+            transpose_2d_as_16b(backend_ctx, extra->m,  buf_trans_m.buffer,  size_m,  M, K/32);
+
+            cl_uchar mask_0F = 0x0F;
+            cl_uchar mask_F0 = 0xF0;
+
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {1, 1, 1};
+
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1_noshuffle;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &buf_trans_qs.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &buf_trans_qh.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &buf_trans_d.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &buf_trans_m.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &buf_unpacked.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+            CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
+            return;
+        }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
@@ -8552,7 +8799,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
        nth *= 2;
    }

-    size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
+    int nchunks = 1;
+    if (src0->type == GGML_TYPE_F32) {
+        const int chunk_target = nth * 4;
+        nchunks = (ne00 + chunk_target - 1) / chunk_target;
+        nchunks = MAX(1, MIN(nchunks, 64));
+    }
+
+    size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -11128,7 +11382,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

    int nth = MIN(64, ne0);

-    cl_kernel kernel = backend_ctx->kernel_concat_f32;
+    const bool concat_pack = (dim == 0 && ne0 < 32);
+    cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
+                                   : backend_ctx->kernel_concat_f32;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@@ -11155,10 +11411,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int),   &dim));

-    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
-    size_t local_work_size[] = {(size_t)nth, 1, 1};
+    if (concat_pack) {
+        // packed kernel needs the dst dims to unflatten its 1-D row index.
+        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
+        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
+        const int base  = MIN(64, maxwg);
+        const int tpr   = MIN(ne0, base);                 // threads per row
+        const int rpw   = MAX(1, base / tpr);             // rows per workgroup
+        const int lsz   = tpr * rpw;
+        const int nrows = ne1*ne2*ne3;
+        const int nwg   = (nrows + rpw - 1) / rpw;
+        size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
+        size_t local_work_size[]  = {(size_t)lsz, 1, 1};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
+    } else {
+        size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
 }

 static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -12176,6 +12450,368 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t
 #endif
 }

+static void ggml_cl_mul_mat_q5_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+
+    const int ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+    cl_context context = backend_ctx->context;
+    cl_kernel kernel;
+
+    cl_int              err;
+    cl_image_format     img_fmt;
+    cl_image_desc       img_desc;
+    cl_buffer_region    region;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
+    if (ne1 == 1) {
+        cl_mem qs_img = nullptr;
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_img = nullptr;
+
+        // image for qs
+        img_fmt = { CL_R, CL_UNSIGNED_INT32 };
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = M * K / 2 / 4;
+        img_desc.buffer = extra0_q5_0->qs;
+        CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        kernel = backend_ctx->kernel_gemv_noshuffle_q5_0_f32;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &qs_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &extra0_q5_0->qh));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra0_q5_0->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int),   &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int),   &ne01));
+
+        size_t local_work_size[3] = {64, 4, 1};
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(qs_img));
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_img));
+    } else {
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_sub_buf_trans = nullptr;
+        cl_mem b_img = nullptr;
+        cl_mem b_img_trans = nullptr;
+        cl_mem d_sub_buf = nullptr;
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // pad N to multiple of 8
+        int extra_elements = N % 8;
+        int padding = 0;
+        if (extra_elements > 0){
+            padding = 8 - extra_elements;
+        }
+
+        // subbuffer for transposed activations
+        region.origin = 0;
+        region.size = K * (N + padding) * sizeof(float)/2;
+        backend_ctx->prealloc_act_trans.allocate(context, region.size);
+        CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for transposed activations
+        img_fmt = {CL_RGBA, CL_HALF_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * (N + padding) / 4;
+        img_desc.buffer = b_sub_buf_trans;
+        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
+
+        // subbuffer for output
+        region.origin = extrad->offset;
+        region.size = M * N * sizeof(float);
+        CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // transpose activations
+        int height_B = N/4;
+        if (height_B == 0) {
+            height_B = 1;
+        }
+        int width_B = K/4;
+        int padded_height_B = (N + padding)/4;
+
+        kernel = backend_ctx->kernel_transpose_32_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
+
+        size_t local_work_size_t[2] = { 1, 16 };
+        size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
+
+        // gemm
+        kernel = backend_ctx->kernel_gemm_noshuffle_q5_0_f32;
+        int padded_N = N + padding;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0_q5_0->qs));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &extra0_q5_0->qh));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra0_q5_0->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &d_sub_buf));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int),   &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int),   &padded_N));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int),   &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int),   &ne1));
+
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
+        size_t local_work_size[3] = {1, 128, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
+        CL_CHECK(clReleaseMemObject(b_img));
+        CL_CHECK(clReleaseMemObject(b_img_trans));
+        CL_CHECK(clReleaseMemObject(d_sub_buf));
+    }
+#else
+    GGML_UNUSED(backend);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+#endif
+}
+
+static void ggml_cl_mul_mat_q5_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+
+    const int ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+    cl_context context = backend_ctx->context;
+    cl_kernel kernel;
+
+    cl_int              err;
+    cl_image_format     img_fmt;
+    cl_image_desc       img_desc;
+    cl_buffer_region    region;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
+    if (ne1 == 1) {
+        cl_mem qs_img = nullptr;
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_img = nullptr;
+
+        // image for qs
+        img_fmt = { CL_R, CL_UNSIGNED_INT32 };
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = M * K / 2 / 4;
+        img_desc.buffer = extra0_q5_1->qs;
+        CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        kernel = backend_ctx->kernel_gemv_noshuffle_q5_1_f32;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &qs_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &extra0_q5_1->qh));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra0_q5_1->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0_q5_1->m));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int),   &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int),   &ne01));
+
+        size_t local_work_size[3] = {64, 4, 1};
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(qs_img));
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_img));
+    } else {
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_sub_buf_trans = nullptr;
+        cl_mem b_img = nullptr;
+        cl_mem b_img_trans = nullptr;
+        cl_mem d_sub_buf = nullptr;
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // pad N to multiple of 8
+        int extra_elements = N % 8;
+        int padding = 0;
+        if (extra_elements > 0){
+            padding = 8 - extra_elements;
+        }
+
+        // subbuffer for transposed activations
+        region.origin = 0;
+        region.size = K * (N + padding) * sizeof(float)/2;
+        backend_ctx->prealloc_act_trans.allocate(context, region.size);
+        CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for transposed activations
+        img_fmt = {CL_RGBA, CL_HALF_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * (N + padding) / 4;
+        img_desc.buffer = b_sub_buf_trans;
+        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
+
+        // subbuffer for output
+        region.origin = extrad->offset;
+        region.size = M * N * sizeof(float);
+        CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // transpose activations
+        int height_B = N/4;
+        if (height_B == 0) {
+            height_B = 1;
+        }
+        int width_B = K/4;
+        int padded_height_B = (N + padding)/4;
+
+        kernel = backend_ctx->kernel_transpose_32_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
+
+        size_t local_work_size_t[2] = { 1, 16 };
+        size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
+
+        // gemm
+        kernel = backend_ctx->kernel_gemm_noshuffle_q5_1_f32;
+        int padded_N = N + padding;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0_q5_1->qs));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &extra0_q5_1->qh));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra0_q5_1->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0_q5_1->m));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &d_sub_buf));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int),   &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int),   &padded_N));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int),   &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int),   &ne1));
+
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
+        size_t local_work_size[3] = {1, 128, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
+        CL_CHECK(clReleaseMemObject(b_img));
+        CL_CHECK(clReleaseMemObject(b_img_trans));
+        CL_CHECK(clReleaseMemObject(d_sub_buf));
+    }
+#else
+    GGML_UNUSED(backend);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+#endif
+}
+
 static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    GGML_ASSERT(src0);
@@ -13214,6 +13850,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            return;
        }

+        // q5_0 x fp32
+        if (src0t == GGML_TYPE_Q5_0 && src1t == GGML_TYPE_F32) {
+            ggml_cl_mul_mat_q5_0_f32_adreno(backend, src0, src1, dst);
+            return;
+        }
+
+        // q5_1 x fp32
+        if (src0t == GGML_TYPE_Q5_1 && src1t == GGML_TYPE_F32) {
+            ggml_cl_mul_mat_q5_1_f32_adreno(backend, src0, src1, dst);
+            return;
+        }
+
        // iq4_nl x fp32
        if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) {
            ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst);
@@ -14536,7 +15184,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            } else if (backend_ctx->gpu_family == ADRENO) {
                nth0 = 64;
                nth1 = 2;
-                ndst = 4;
+                ndst = 16;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }
@@ -16633,7 +17281,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
                    kernel = backend_ctx->kernel_cpy_f32_f16;
                    break;
                case GGML_TYPE_F32:
-                    kernel = backend_ctx->kernel_cpy_f32_f32;
+                    kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
+                                       : backend_ctx->kernel_cpy_f32_f32;
                    break;
                default:
                    GGML_ASSERT(false && "not implemented");
@@ -16685,12 +17334,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));

-    const int nth = MIN(64, ne00);
+    if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
+        const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
+        const int base  = MIN(64, maxwg);
+        const int tpr   = MIN(ne00, base);                 // threads per row
+        const int rpw   = MAX(1, base / tpr);              // rows per workgroup
+        const int lsz   = tpr * rpw;                       // <= base <= maxwg
+        const int nrows = ne01*ne02*ne03;
+        const int nwg   = (nrows + rpw - 1) / rpw;

-    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
-    size_t local_work_size[] = {(size_t)nth, 1, 1};
+        size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
+        size_t local_work_size[]  = {(size_t)lsz, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
+        backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
+    } else {
+        const int nth = MIN(64, ne00);
+
+        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
+    }
 }

 static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -17705,7 +18369,7 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) {
    const cl_uint H_v      = (cl_uint) src_v->ne[1];
    const cl_uint n_tokens = (cl_uint) src_v->ne[2];
    const cl_uint n_seqs   = (cl_uint) src_v->ne[3];
-    const cl_uint K        = (cl_uint) src_state->ne[1];
+    const cl_uint K        = (cl_uint) ggml_get_op_params_i32(dst, 0);

    int si;
    switch (S_v) {
@@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
        *y = *x;
    }
 }
+
+kernel void kernel_concat_f32_pack(
+    global  const char * src0,
+    ulong                offset0,
+    global  const char * src1,
+    ulong                offset1,
+    global        char * dst,
+    ulong                offsetd,
+    int             ne00,
+    int             ne01,
+    int             ne02,
+    int             ne03,
+    ulong           nb00,
+    ulong           nb01,
+    ulong           nb02,
+    ulong           nb03,
+    ulong           nb10,
+    ulong           nb11,
+    ulong           nb12,
+    ulong           nb13,
+    int             ne0,
+    ulong           nb0,
+    ulong           nb1,
+    ulong           nb2,
+    ulong           nb3,
+    int             dim,
+    int             ne1,
+    int             ne2,
+    int             ne3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
+
+    int lsz = get_local_size(0);
+    int tpr = min(ne0, lsz);          // threads per row
+    int rpw = lsz / tpr;              // rows per workgroup
+    int lid = get_local_id(0);
+    int row = get_group_id(0)*rpw + lid / tpr;
+    int lane = lid - (lid / tpr) * tpr;
+
+    int nrows = ne1*ne2*ne3;
+    if (row >= nrows) {
+        return;
+    }
+
+    int i1 = row % ne1;
+    int t  = row / ne1;
+    int i2 = t % ne2;
+    int i3 = t / ne2;
+
+    int o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+    for (int i0 = lane; i0 < ne0; i0 += tpr) {
+        global const float * x;
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (global const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+        } else {
+            x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+        }
+
+        global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
+    }
+}
@@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
    }
 }

+kernel void kernel_cpy_f32_f32_pack(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int lsz = get_local_size(0);
+    int tpr = min(ne00, lsz);          // threads per row
+    int rpw = lsz / tpr;               // rows per workgroup
+    int lid = get_local_id(0);
+    int row = get_group_id(0)*rpw + lid / tpr;
+    int lane = lid - (lid / tpr) * tpr;
+
+    int nrows = ne01*ne02*ne03;
+    if (row >= nrows) {
+        return;
+    }
+
+    int i01 = row % ne01;
+    int t   = row / ne01;
+    int i02 = t % ne02;
+    int i03 = t / ne02;
+
+    // linear index of the first element of this row, unflattened over dst dims
+    long n  = (long)row * ne00;
+    int i3  = (int)(n / ((long)ne2*ne1*ne0));
+    long rm = n - (long)i3*ne2*ne1*ne0;
+    int i2  = (int)(rm / ((long)ne1*ne0));
+    rm     -= (long)i2*ne1*ne0;
+    int i1  = (int)(rm / ne0);
+    int i0  = (int)(rm - (long)i1*ne0);
+
+    global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int i00 = lane; i00 < ne00; i00 += tpr) {
+        global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
 kernel void kernel_cpy_i32_i32(
        global int * src0,
        ulong offset0,
@@ -584,6 +584,60 @@ kernel void kernel_restore_block_q5_0(
    }
 }

+kernel void kernel_convert_block_q5_0_noshuffle(
+    global struct block_q5_0 * src0,
+    global uchar * dst_q,
+    global uint  * dst_qh,
+    global half  * dst_d
+) {
+    global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
+    global uchar * q  = (global uchar *) dst_q + QK5_0/2*get_global_id(0);
+    global uint  * qh = (global uint  *) dst_qh + get_global_id(0);
+    global half  * d  = (global half  *) dst_d  + get_global_id(0);
+
+    *d = b->d;
+    *qh = *((global uint *)(b->qh));
+
+    for (int i = 0; i < QK5_0/4; ++i) {
+        uchar x0 = b->qs[2*i + 0];
+        uchar x1 = b->qs[2*i + 1];
+
+        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
+        q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
+
+#ifdef ADRENO_GPU
+        if (get_global_id(0) == 65536*4096) {
+            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
+        }
+#endif
+    }
+}
+
+kernel void kernel_restore_block_q5_0_noshuffle(
+    global uchar * src_q,
+    global uint  * src_qh,
+    global half  * src_d,
+    global struct block_q5_0 * dst,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
+    global uchar * q  = (global uchar *) src_q + QK5_0/2*get_global_id(0);
+    global uint  * qh = (global uint  *) src_qh + get_global_id(0);
+    global half  * d  = (global half  *) src_d  + get_global_id(0);
+
+    b->d = *d;
+    *((global uint *)(b->qh)) = *qh;
+
+    for (int i = 0; i < QK5_0/4; ++i) {
+        uchar x0 = q[i + 0      ];
+        uchar x1 = q[i + QK5_0/4];
+
+        b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
+        b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
+    }
+}
+
 kernel void kernel_convert_block_q5_0_trans4_ns(
    __global struct block_q5_0 * src0,
    __global uint * dst_qs,
@@ -736,6 +790,66 @@ kernel void kernel_restore_block_q5_1(
    }
 }

+kernel void kernel_convert_block_q5_1_noshuffle(
+    global struct block_q5_1 * src0,
+    global uchar * dst_q,
+    global uint  * dst_qh,
+    global half  * dst_d,
+    global half  * dst_m
+) {
+    global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
+    global uchar * q  = (global uchar *) dst_q + QK5_1/2*get_global_id(0);
+    global uint  * qh = (global uint  *) dst_qh + get_global_id(0);
+    global half  * d  = (global half  *) dst_d  + get_global_id(0);
+    global half  * m  = (global half  *) dst_m  + get_global_id(0);
+
+    *d = b->d;
+    *m = b->m;
+    *qh = *((global uint *)(b->qh));
+
+    for (int i = 0; i < QK5_1/4; ++i) {
+        uchar x0 = b->qs[2*i + 0];
+        uchar x1 = b->qs[2*i + 1];
+
+        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
+        q[i + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
+
+#ifdef ADRENO_GPU
+        if (get_global_id(0) == 65536*4096) {
+            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
+        }
+#endif
+    }
+}
+
+kernel void kernel_restore_block_q5_1_noshuffle(
+    global uchar * src_q,
+    global uint  * src_qh,
+    global half  * src_d,
+    global half  * src_m,
+    global struct block_q5_1 * dst,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
+    global uchar * q  = (global uchar *) src_q + QK5_1/2*get_global_id(0);
+    global uint  * qh = (global uint  *) src_qh + get_global_id(0);
+    global half  * d  = (global half  *) src_d  + get_global_id(0);
+    global half  * m  = (global half  *) src_m  + get_global_id(0);
+
+    b->d = *d;
+    b->m = *m;
+    *((global uint *)(b->qh)) = *qh;
+
+    for (int i = 0; i < QK5_1/4; ++i) {
+        uchar x0 = q[i + 0      ];
+        uchar x1 = q[i + QK5_1/4];
+
+        b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
+        b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
+    }
+}
+
 kernel void kernel_convert_block_q5_1_trans4_ns(
    __global struct block_q5_1 * src0,
    __global uint * dst_qs,
@@ -123,7 +123,8 @@ kernel void kernel_gated_delta_net(
    const uint iq3 = seq_id / rq3; // seq index for Q and K

    const uint state_size = S_V * S_V;
-    const uint state_base = (seq_id * K * H_v + head_id) * state_size;
+    // input state holds s0 only [S_v, S_v, H, n_seqs]: per-seq stride is H*D.
+    const uint state_base = (seq_id * H_v + head_id) * state_size;
    const uint q_off_base  = iq3 * sq3 + iq1 * sq1;
    const uint v_off_base  = seq_id * sv3 + head_id * sv1;
    const uint gb_off_base = seq_id * sb3 + head_id * sb1;
@@ -143,7 +144,8 @@ kernel void kernel_gated_delta_net(
        }
    }

-    const int shift = (int)n_tokens - (int)K;
+    // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
+    // When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.
    uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V;

    for (uint t = 0; t < n_tokens; t++) {
@@ -219,7 +221,7 @@ kernel void kernel_gated_delta_net(
        attn_off += S_V * H_v;

        if (K > 1u) {
-            const int target_slot = (int)t - shift;
+            const int target_slot = (int)n_tokens - 1 - (int)t;
            if (target_slot >= 0 && target_slot < (int)K) {
                #pragma unroll
                for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
@@ -0,0 +1,131 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_128
+#endif
+
+kernel void kernel_gemm_noshuffle_q5_0_f32(
+        global const ushort * src0_qs,      // quantized A
+        global const uchar  * src0_qh,      // 5th bits
+        global const half   * src0_d,       // A scales
+        __read_only image1d_buffer_t src1,  // B (1d image)
+        global float * dst,                 // C
+        int m,                              // M
+        int n,                              // N with padding
+        int k,                              // K
+        int n_no_padding                    // N without padding
+) {
+
+    int n_4 = n >> 2;
+
+    int gy = get_global_id(0);
+    int gx = get_global_id(1);
+    int gx_2 = gx << 2;
+
+    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
+    half8 B;
+    half4 dequantized_weights;
+
+    global const ushort * weight_ptr = src0_qs + gx_2;
+    global const uchar  * qh_ptr    = src0_qh + gx_2;
+    global const half   * scale_ptr = src0_d  + gx_2;
+
+    for (int i = 0; i < k; i += 4) {
+
+        B.s0123 = read_imageh(src1, gy*2 + i*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1);
+
+        ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m);
+        uchar4  bits1 = vload4(0, qh_ptr + (i >> 3)*m);
+        uchar4  qh = bits1 >> (uchar4)(i & 4);
+
+        half4 scale = vload4(0, scale_ptr + (i >> 5)*m);
+
+        // j=0
+        dequantized_weights.s0 = (convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) - 16.0h) * scale.s0;
+        dequantized_weights.s1 = (convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) - 16.0h) * scale.s1;
+        dequantized_weights.s2 = (convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) - 16.0h) * scale.s2;
+        dequantized_weights.s3 = (convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) - 16.0h) * scale.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=1
+        B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1);
+        dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) - 16.0h) * scale.s0;
+        dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) - 16.0h) * scale.s1;
+        dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) - 16.0h) * scale.s2;
+        dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) - 16.0h) * scale.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=2
+        B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1);
+        dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) - 16.0h) * scale.s0;
+        dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) - 16.0h) * scale.s1;
+        dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) - 16.0h) * scale.s2;
+        dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) - 16.0h) * scale.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=3
+        B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1);
+        dequantized_weights.s0 = (convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) - 16.0h) * scale.s0;
+        dequantized_weights.s1 = (convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) - 16.0h) * scale.s1;
+        dequantized_weights.s2 = (convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) - 16.0h) * scale.s2;
+        dequantized_weights.s3 = (convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) - 16.0h) * scale.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+    }
+
+    int idx = (gy<<3)*m + (gx<<2);
+
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+    }
+}
@@ -0,0 +1,134 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_128
+#endif
+
+kernel void kernel_gemm_noshuffle_q5_1_f32(
+        global const ushort * src0_qs,      // quantized A
+        global const uchar  * src0_qh,      // 5th bits
+        global const half   * src0_d,       // A scales
+        global const half   * src0_m,       // A mins
+        __read_only image1d_buffer_t src1,  // B (1d image)
+        global float * dst,                 // C
+        int m,                              // M
+        int n,                              // N with padding
+        int k,                              // K
+        int n_no_padding                    // N without padding
+) {
+
+    int n_4 = n >> 2;
+
+    int gy = get_global_id(0);
+    int gx = get_global_id(1);
+    int gx_2 = gx << 2;
+
+    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
+    half8 B;
+    half4 dequantized_weights;
+
+    global const ushort * weight_ptr = src0_qs + gx_2;
+    global const uchar  * qh_ptr    = src0_qh + gx_2;
+    global const half   * scale_ptr = src0_d  + gx_2;
+    global const half   * min_ptr   = src0_m  + gx_2;
+
+    for (int i = 0; i < k; i += 4) {
+
+        B.s0123 = read_imageh(src1, gy*2 + i*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1);
+
+        ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m);
+        uchar4  bits1 = vload4(0, qh_ptr + (i >> 3)*m);
+        uchar4  qh = bits1 >> (uchar4)(i & 4);
+
+        half4 scale = vload4(0, scale_ptr + (i >> 5)*m);
+        half4 minv  = vload4(0, min_ptr   + (i >> 5)*m);
+
+        // j=0
+        dequantized_weights.s0 = convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) * scale.s0 + minv.s0;
+        dequantized_weights.s1 = convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) * scale.s1 + minv.s1;
+        dequantized_weights.s2 = convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) * scale.s2 + minv.s2;
+        dequantized_weights.s3 = convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) * scale.s3 + minv.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=1
+        B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1);
+        dequantized_weights.s0 = convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) * scale.s0 + minv.s0;
+        dequantized_weights.s1 = convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) * scale.s1 + minv.s1;
+        dequantized_weights.s2 = convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) * scale.s2 + minv.s2;
+        dequantized_weights.s3 = convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) * scale.s3 + minv.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=2
+        B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1);
+        dequantized_weights.s0 = convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) * scale.s0 + minv.s0;
+        dequantized_weights.s1 = convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) * scale.s1 + minv.s1;
+        dequantized_weights.s2 = convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) * scale.s2 + minv.s2;
+        dequantized_weights.s3 = convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) * scale.s3 + minv.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=3
+        B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4);
+        B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1);
+        dequantized_weights.s0 = convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) * scale.s0 + minv.s0;
+        dequantized_weights.s1 = convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) * scale.s1 + minv.s1;
+        dequantized_weights.s2 = convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) * scale.s2 + minv.s2;
+        dequantized_weights.s3 = convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) * scale.s3 + minv.s3;
+        c0 += B * dequantized_weights.s0;
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+    }
+
+    int idx = (gy<<3)*m + (gx<<2);
+
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+    }
+}
@@ -0,0 +1,291 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif
+
+#define QK5_0 32
+#define NSUBGROUPS 4
+#define SUBGROUP_SIZE 64
+
+#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, y) \
+    float shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0      ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4      ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1      ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5      ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, y) \
+    shared_y = sub_group_broadcast(y.s0, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2      ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6      ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3      ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7      ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, y) \
+    float8 shared_y; \
+    shared_y = sub_group_broadcast(y, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x000F)         | (((bits1.s0     ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4)  | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8)  | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += (((bits4.s2 & 0x000F)         | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4)  | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8)  | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += (((bits4.s1 & 0x000F)         | (((bits1.s4     ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4)  | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8)  | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += (((bits4.s3 & 0x000F)         | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4)  | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8)  | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x000F)         | (((bits1.s1     ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4)  | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8)  | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += (((bits4.s6 & 0x000F)         | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4)  | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8)  | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += (((bits4.s5 & 0x000F)         | (((bits1.s5     ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4)  | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8)  | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += (((bits4.s7 & 0x000F)         | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4)  | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8)  | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
+
+
+#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, y) \
+    shared_y = sub_group_broadcast(y, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x000F)         | (((bits1.s2     ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4)  | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8)  | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += (((bits4.s2 & 0x000F)         | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4)  | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8)  | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += (((bits4.s1 & 0x000F)         | (((bits1.s6     ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4)  | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8)  | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += (((bits4.s3 & 0x000F)         | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4)  | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8)  | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x000F)         | (((bits1.s3     ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4)  | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8)  | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += (((bits4.s6 & 0x000F)         | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4)  | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8)  | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += (((bits4.s5 & 0x000F)         | (((bits1.s7     ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4)  | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8)  | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += (((bits4.s7 & 0x000F)         | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4)  | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8)  | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+__kernel void kernel_gemv_noshuffle_q5_0_f32(
+        __read_only  image1d_buffer_t src0_qs,  // quantized A
+        global ushort * src0_qh,                 // 5th bits
+        global half2  * src0_d,                  // A scales
+        __read_only  image1d_buffer_t src1,      // B activations
+        global float * dst,
+        ulong offsetd,
+        int ne00,               // K
+        int ne01)               // M
+{
+    uint groupId = get_local_id(1);
+    uint gid     = get_global_id(0);
+    ushort slid  = get_sub_group_local_id();
+
+    uint K = ne00;
+    uint M = ne01;
+
+    uint LINE_STRIDE_A    = M / 2;
+    uint BLOCK_STRIDE_A  = NSUBGROUPS * M;
+
+    private uint4     regA;
+    private half2     regS;
+    private float8    regB;
+
+    private float2 totalSum = (float2)(0.0f);
+
+    for (uint k = groupId; k < (K / QK5_0); k += NSUBGROUPS) {
+        regS = src0_d[gid + k * LINE_STRIDE_A];
+
+        ushort4 qh_raw;
+        qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A];
+        qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A];
+        qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A];
+        qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A];
+
+        uchar8 raw = as_uchar8(qh_raw);
+        uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6,
+                                    raw.s1, raw.s3, raw.s5, raw.s7);
+
+        // Load activations
+        if (slid < 4) {
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+        regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+
+#ifdef VECTOR_SUB_GROUP_BROADCAST
+        dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
+#else
+        dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAST
+
+        regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+        regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAST
+        dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
+#else
+        dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAST
+    }
+
+    // reduction in local memory, assumes #wave=4
+    local float2 reduceLM[SUBGROUP_SIZE * 3];
+    if (groupId == 1) {
+        reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
+    }
+    if (groupId == 2) {
+        reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
+    }
+    if (groupId == 3) {
+        reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (groupId == 0) {
+        totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
+    }
+    if (groupId == 0) {
+        totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
+    }
+    if (groupId == 0) {
+        totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
+    }
+
+    // 2 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd);
+        vstore2(totalSum, 0, &(dst[gid * 2]));
+    }
+
+}
@@ -0,0 +1,294 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif
+
+#define QK5_1 32
+#define NSUBGROUPS 4
+#define SUBGROUP_SIZE 64
+
+#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, minv, y) \
+    float shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0      ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4      ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1      ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5      ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, minv, y) \
+    shared_y = sub_group_broadcast(y.s0, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2      ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6      ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3      ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7      ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, minv, y) \
+    float8 shared_y; \
+    shared_y = sub_group_broadcast(y, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x000F)         | (((bits1.s0     ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4)  | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8)  | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
+    total_sums.s0 += (((bits4.s2 & 0x000F)         | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4)  | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8)  | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
+    total_sums.s1 += (((bits4.s1 & 0x000F)         | (((bits1.s4     ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4)  | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8)  | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
+    total_sums.s1 += (((bits4.s3 & 0x000F)         | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4)  | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8)  | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x000F)         | (((bits1.s1     ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4)  | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8)  | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
+    total_sums.s0 += (((bits4.s6 & 0x000F)         | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4)  | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8)  | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
+    total_sums.s1 += (((bits4.s5 & 0x000F)         | (((bits1.s5     ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4)  | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8)  | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
+    total_sums.s1 += (((bits4.s7 & 0x000F)         | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4)  | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8)  | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
+
+
+#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, minv, y) \
+    shared_y = sub_group_broadcast(y, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x000F)         | (((bits1.s2     ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4)  | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8)  | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
+    total_sums.s0 += (((bits4.s2 & 0x000F)         | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4)  | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8)  | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
+    total_sums.s1 += (((bits4.s1 & 0x000F)         | (((bits1.s6     ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4)  | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8)  | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
+    total_sums.s1 += (((bits4.s3 & 0x000F)         | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4)  | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8)  | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x000F)         | (((bits1.s3     ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
+    total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4)  | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
+    total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8)  | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
+    total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
+    total_sums.s0 += (((bits4.s6 & 0x000F)         | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
+    total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4)  | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
+    total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8)  | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
+    total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
+    total_sums.s1 += (((bits4.s5 & 0x000F)         | (((bits1.s7     ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
+    total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4)  | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
+    total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8)  | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
+    total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
+    total_sums.s1 += (((bits4.s7 & 0x000F)         | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
+    total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4)  | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
+    total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8)  | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
+    total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+__kernel void kernel_gemv_noshuffle_q5_1_f32(
+        __read_only  image1d_buffer_t src0_qs,  // quantized A
+        global ushort * src0_qh,                 // 5th bits
+        global half2  * src0_d,                  // A scales
+        global half2  * src0_m,                  // A mins
+        __read_only  image1d_buffer_t src1,      // B activations
+        global float * dst,
+        ulong offsetd,
+        int ne00,               // K
+        int ne01)               // M
+{
+    uint groupId = get_local_id(1);
+    uint gid     = get_global_id(0);
+    ushort slid  = get_sub_group_local_id();
+
+    uint K = ne00;
+    uint M = ne01;
+
+    uint LINE_STRIDE_A    = M / 2;
+   uint BLOCK_STRIDE_A  = NSUBGROUPS * M;
+
+    __private uint4     regA;
+    __private half2     regS;
+    __private half2     regM;
+    __private float8    regB;
+
+    __private float2 totalSum = (float2)(0.0f);
+
+    for (uint k = groupId; k < (K / QK5_1); k += NSUBGROUPS) {
+        regS = src0_d[gid + k * LINE_STRIDE_A];
+        regM = src0_m[gid + k * LINE_STRIDE_A];
+
+        ushort4 qh_raw;
+        qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A];
+        qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A];
+        qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A];
+        qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A];
+
+        uchar8 raw = as_uchar8(qh_raw);
+        uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6,
+                                    raw.s1, raw.s3, raw.s5, raw.s7);
+
+        // Load activations
+        if (slid < 4) {
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+        regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+
+#ifdef VECTOR_SUB_GROUP_BROADCAST
+        dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
+#else
+        dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAST
+
+        regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+        regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAST
+        dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
+#else
+        dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAST
+    }
+
+    // reduction in local memory, assumes #wave=4
+    local float2 reduceLM[SUBGROUP_SIZE * 3];
+    if (groupId == 1) {
+        reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
+    }
+    if (groupId == 2) {
+        reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
+    }
+    if (groupId == 3) {
+        reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (groupId == 0) {
+        totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
+    }
+    if (groupId == 0) {
+        totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
+    }
+    if (groupId == 0) {
+        totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
+    }
+
+    // 2 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd);
+        vstore2(totalSum, 0, &(dst[gid * 2]));
+    }
+
+}
@@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
    src1 = (global int*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

-    int i10 = get_group_id(0);
-    int i11 = get_group_id(1);
-    int i12 = get_group_id(2);
+    int nchunks = get_num_groups(0) / ne10;
+    int g       = get_group_id(0);
+    int i10     = g / nchunks;
+    int chunk   = g - i10 * nchunks;
+    int i11     = get_group_id(1);
+    int i12     = get_group_id(2);

    int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];

    int i02 = i11;
    int i03 = i12;

-    for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
-        if (ind >= ne00) {
-            return;
-        }
-        ((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
-            ((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
+    global float * dst_row = (global float *) ((global char *) dst  + i12*nb3 + i11*nb2 + i10*nb1);
+    global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
+
+    int span  = (ne00 + nchunks - 1) / nchunks;
+    int start = chunk * span;
+    int end   = min(start + span, ne00);
+
+    for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
+        dst_row[ind] = src_row[ind];
    }
 }

@@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat(
    global uchar * blk_qh,
    global char  * blk_scales,
    global half  * blk_d,
-    global float * yy,
    int ib,
    int ip,
    int is,
-    int l0
+    int l0,
+    float4 y0,
+    float4 y1,
+    float4 y2,
+    float4 y3
 ) {
-    int y_offset   = 128*ip + l0;
    int q_offset_l =  64*ip + l0;
    int q_offset_h =  32*ip + l0;

@@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat(
    global uchar * qh = blk_qh     + ib*64 + q_offset_h;
    global char  * sc = blk_scales + ib*16 + is;

-    global float * y = yy + ib * QK_K + y_offset;
-
    float dall = blk_d[ib];

-    float  sumf = 0;
-    float4 sums = {0.f, 0.f, 0.f, 0.f};
+    // Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads.
+    // q_offset_l/h are 4-aligned, so these are aligned vector loads.
+    uchar4 q1v = vload4(0, q1);
+    uchar4 q2v = vload4(0, q2);
+    uchar4 qhv = vload4(0, qh);

-    sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[0+64] * ((float)((q1[0]  >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[0+96] * ((float)((q2[0]  >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
+    int4 q1i = convert_int4(q1v);
+    int4 q2i = convert_int4(q2v);
+    int4 qhi = convert_int4(qhv);

-    sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[1+64] * ((float)((q1[1]  >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[1+96] * ((float)((q2[1]  >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
+    // Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the
+    // matching 2-bit plane of qh), same arithmetic as the scalar version, then dot()
+    // against the cached activation lanes.
+    float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f;
+    float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f;
+    float4 w2 = convert_float4((q1i >> 4)  | ((qhi & Q6_K_MASK3)     )) - 32.f;
+    float4 w3 = convert_float4((q2i >> 4)  | ((qhi & Q6_K_MASK4) >> 2)) - 32.f;

-    sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[2+64] * ((float)((q1[2]  >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[2+96] * ((float)((q2[2]  >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
-
-    sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[3+64] * ((float)((q1[3]  >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[3+96] * ((float)((q2[3]  >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
-
-    sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
-
-    return sumf;
+    return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] +
+                   dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]);
 }

 #undef N_DST
@@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat(
 #define N_SIMDGROUP 2
 #define N_SIMDWIDTH 16
 #elif defined (ADRENO_GPU)
-#define N_DST 4
+#define N_DST 16
 #define N_SIMDGROUP 2
 #define N_SIMDWIDTH 64
 #endif
@@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat(
    global half  * blk_d      = (global half  *) src0_d  + offset_src0_d;
    global float * yy         = (global float *) src1    + r1*ne10 + im*ne00*ne1;

-    int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
-    int ix  = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+    int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15
+    int ix  = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1
    int ip  = tid/8;   // first or second half of (super) block (0 or 1)
    int il  = tid%8;   // each half has 8 parts, one per scale
    int n   = 4;       // 4 scales at a time (and 4 sums)
    int l0  = n*il;    // offset into half-block, 0..28
    int is  = 8*ip + l0/16; // 0, 1, 8, 9

-    float4 sumf = 0;
+    float sumf[N_DST];
+    for (int row = 0; row < N_DST; row++) {
+        sumf[row] = 0.f;
+    }

    for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
-        if (first_row + 0 < ne01) {
-            sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
-        }
-        if (first_row + 1 < ne01) {
-            sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
-        }
-        if (first_row + 2 < ne01) {
-            sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
-        }
-        if (first_row + 3 < ne01) {
-            sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
+        global float * y = yy + ib * QK_K + 128*ip + l0;
+        float4 y0 = vload4(0, y +  0);
+        float4 y1 = vload4(0, y + 32);
+        float4 y2 = vload4(0, y + 64);
+        float4 y3 = vload4(0, y + 96);
+
+        for (int row = 0; row < N_DST; row++) {
+            if (first_row + row < ne01) {
+                sumf[row] += block_q_6_K_dot_y_flat(
+                    blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb,
+                    ib, ip, is, l0, y0, y1, y2, y3);
+            }
        }
    }

-    float4 tot = (float4)(
-        sub_group_reduce_add(sumf.s0),
-        sub_group_reduce_add(sumf.s1),
-        sub_group_reduce_add(sumf.s2),
-        sub_group_reduce_add(sumf.s3)
-    );
-    if (get_sub_group_local_id() == 0) {
-        if (first_row + 0 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
-        }
-        if (first_row + 1 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
-        }
-        if (first_row + 2 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
-        }
-        if (first_row + 3 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+    for (int row = 0; row < N_DST; row++) {
+        float tot = sub_group_reduce_add(sumf[row]);
+        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
        }
    }
 }
@@ -59,7 +59,7 @@ bool gpu_has_xmx(sycl::device &dev) {
    return dev.has(sycl::aspect::ext_intel_matrix);
 }

-static int ggml_sycl_get_env(const char *env_name, int default_val) {
+int ggml_sycl_get_env(const char *env_name, int default_val) {
    char *user_device_string = getenv(env_name);
    int user_number = default_val;

@@ -86,7 +86,7 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block

 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
 static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
-    return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) &&
+    return g_ggml_sycl_enable_level_zero &&
        q.get_device().is_gpu() &&
        q.get_backend() == sycl::backend::ext_oneapi_level_zero;
 }
@@ -94,8 +94,6 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {

 // Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
 // DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
-// The decision is made from the queue and runtime env because large buffers can be
-// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero.
 void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
    if (ggml_sycl_use_level_zero_device_alloc(q)) {
@@ -225,6 +225,7 @@ struct sycl_device_info {
    int max_wg_per_cu; // max work groups per compute unit - refer to
                       // cudaOccupancyMaxActiveBlocksPerMultiprocessor
    bool    vmm;                // virtual memory support
+    bool    l0_discrete_gpu;    // Level Zero backend and not an integrated GPU
    size_t  vmm_granularity;    // granularity of virtual memory
    size_t  total_vram;
    sycl_hw_info hw_info;
@@ -644,6 +645,8 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {

 bool gpu_has_xmx(sycl::device &dev);

+int ggml_sycl_get_env(const char *env_name, int default_val);
+
 template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
    if (LIKELY(!g_ggml_sycl_debug)) {
        return "";
@@ -48,6 +48,287 @@ inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    }
 }

+inline void cpy_blck_f32_q1_0(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q1_0 *  dsti = (block_q1_0 *) cdsti;
+
+    float sum_abs = 0.0f;
+    for (int j = 0; j < QK1_0; ++j) {
+        sum_abs += sycl::fabs((float) xi[j]);
+    }
+
+    dsti->d = sum_abs / QK1_0;
+
+    for (int j = 0; j < QK1_0 / 8; ++j) {
+        dsti->qs[j] = 0;
+    }
+
+    for (int j = 0; j < QK1_0; ++j) {
+        if (xi[j] >= 0.0f) {
+            dsti->qs[j / 8] |= (1u << (j % 8));
+        }
+    }
+}
+
+inline int best_index_mxfp4(const float x, const float e) {
+    int best_index = 0;
+    float best_err = sycl::fabs((float) (kvalues_mxfp4[0] * e - x));
+    for (int i = 1; i < 16; ++i) {
+        const float err = sycl::fabs((float) (kvalues_mxfp4[i] * e - x));
+        if (err < best_err) {
+            best_index = i;
+            best_err = err;
+        }
+    }
+    return best_index;
+}
+
+inline int nearest_int_sycl(float x) {
+    const float val = x + 12582912.0f;
+    int i;
+    memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+inline int nearest_int_ggml_sycl(float x) {
+    return (int) sycl::round((float) x);
+}
+
+inline uint8_t clamp_u8(const int x, const int lo, const int hi) {
+    return (uint8_t) dpct::max(lo, dpct::min(hi, x));
+}
+
+inline int8_t clamp_i8(const int x, const int lo, const int hi) {
+    return (int8_t) dpct::max(lo, dpct::min(hi, x));
+}
+
+constexpr float GROUP_MAX_EPS_SYCL = 1e-15f;
+
+inline float make_qx_quants_sycl(int n, int nmax, const float * x, int8_t * L, int rmse_type, const float * qw) {
+    float max = 0.0f;
+    float amax = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        const float ax = sycl::fabs(x[i]);
+        if (ax > amax) {
+            amax = ax;
+            max = x[i];
+        }
+    }
+    if (amax < GROUP_MAX_EPS_SYCL) {
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.0f;
+    }
+
+    float iscale = -nmax / max;
+    if (rmse_type == 0) {
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int_ggml_sycl(iscale * x[i]);
+            L[i] = (int8_t) (nmax + dpct::max(-nmax, dpct::min(nmax - 1, l)));
+        }
+        return 1.0f / iscale;
+    }
+
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
+
+    float sumlx = 0.0f;
+    float suml2 = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int_ggml_sycl(iscale * x[i]);
+        l = dpct::max(-nmax, dpct::min(nmax - 1, l));
+        L[i] = (int8_t) (l + nmax);
+
+        const float w = qw ? qw[i] : (rmse_type == 1 ? x[i] * x[i] :
+            rmse_type == 2 ? 1.0f : rmse_type == 3 ? sycl::fabs(x[i]) : sycl::sqrt(sycl::fabs(x[i])));
+
+        sumlx += w * x[i] * l;
+        suml2 += w * l * l;
+    }
+
+    float scale = suml2 ? sumlx / suml2 : 0.0f;
+    if (return_early) {
+        return suml2 > 0.0f ? 0.5f * (scale + 1.0f / iscale) : 1.0f / iscale;
+    }
+
+    float best = scale * sumlx;
+    for (int is = -9; is <= 9; ++is) {
+        if (is == 0) {
+            continue;
+        }
+        iscale = -(nmax + 0.1f * is) / max;
+        sumlx = 0.0f;
+        suml2 = 0.0f;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int_ggml_sycl(iscale * x[i]);
+            l = dpct::max(-nmax, dpct::min(nmax - 1, l));
+            const float w = qw ? qw[i] : (rmse_type == 1 ? x[i] * x[i] :
+                rmse_type == 2 ? 1.0f : rmse_type == 3 ? sycl::fabs(x[i]) : sycl::sqrt(sycl::fabs(x[i])));
+            sumlx += w * x[i] * l;
+            suml2 += w * l * l;
+        }
+
+        if (suml2 > 0.0f && sumlx * sumlx > best * suml2) {
+            for (int i = 0; i < n; ++i) {
+                int l = nearest_int_ggml_sycl(iscale * x[i]);
+                L[i] = (int8_t) (nmax + dpct::max(-nmax, dpct::min(nmax - 1, l)));
+            }
+            scale = sumlx / suml2;
+            best = scale * sumlx;
+        }
+    }
+
+    return scale;
+}
+
+inline float make_q3_quants_sycl(int n, int nmax, const float * x, int8_t * L, bool do_rmse) {
+    float max = 0.0f;
+    float amax = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        const float ax = sycl::fabs(x[i]);
+        if (ax > amax) {
+            amax = ax;
+            max = x[i];
+        }
+    }
+
+    if (amax < GROUP_MAX_EPS_SYCL) {
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.0f;
+    }
+
+    const float iscale = -nmax / max;
+    if (do_rmse) {
+        float sumlx = 0.0f;
+        float suml2 = 0.0f;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int_ggml_sycl(iscale * x[i]);
+            l = dpct::max(-nmax, dpct::min(nmax - 1, l));
+            L[i] = (int8_t) l;
+            const float w = x[i] * x[i];
+            sumlx += w * x[i] * l;
+            suml2 += w * l * l;
+        }
+
+        for (int itry = 0; itry < 5; ++itry) {
+            int n_changed = 0;
+            for (int i = 0; i < n; ++i) {
+                const float w = x[i] * x[i];
+                float slx = sumlx - w * x[i] * L[i];
+                if (slx > 0.0f) {
+                    float sl2 = suml2 - w * L[i] * L[i];
+                    int new_l = nearest_int_ggml_sycl(x[i] * sl2 / slx);
+                    new_l = dpct::max(-nmax, dpct::min(nmax - 1, new_l));
+                    if (new_l != L[i]) {
+                        slx += w * x[i] * new_l;
+                        sl2 += w * new_l * new_l;
+                        if (sl2 > 0.0f && slx * slx * suml2 > sumlx * sumlx * sl2) {
+                            L[i] = (int8_t) new_l;
+                            sumlx = slx;
+                            suml2 = sl2;
+                            ++n_changed;
+                        }
+                    }
+                }
+            }
+            if (!n_changed) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < n; ++i) {
+            L[i] += nmax;
+        }
+        return suml2 > 0.0f ? sumlx / suml2 : 0.0f;
+    }
+
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int_ggml_sycl(iscale * x[i]);
+        l = dpct::max(-nmax, dpct::min(nmax - 1, l));
+        L[i] = (int8_t) (l + nmax);
+    }
+
+    return 1.0f / iscale;
+}
+
+inline void set_scale_min_k4(int j, uint8_t * q, uint8_t d, uint8_t m) {
+    if (j < 4) {
+        q[j]     = (q[j] & 0xC0) | (d & 0x3F);
+        q[j + 4] = (q[j + 4] & 0xC0) | (m & 0x3F);
+    } else {
+        q[j + 4] = (d & 0x0F) | ((m & 0x0F) << 4);
+        q[j - 4] = (q[j - 4] & 0x3F) | ((d >> 4) << 6);
+        q[j - 0] = (q[j - 0] & 0x3F) | ((m >> 4) << 6);
+    }
+}
+
+inline void get_scale_min_k4_local(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63;
+        m = q[j + 4] & 63;
+    } else {
+        d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
+        m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
+    }
+}
+
+inline void cpy_blck_f32_mxfp4(const char * cxi, char * cdsti) {
+    const float *   xi   = (const float *) cxi;
+    block_mxfp4 *   dsti = (block_mxfp4 *) cdsti;
+
+    float amax = 0.0f;
+    for (int j = 0; j < QK_MXFP4; ++j) {
+        amax = sycl::fmax(amax, sycl::fabs((float) xi[j]));
+    }
+
+    const uint8_t e = amax > 0.0f ? (uint8_t) (sycl::floor(sycl::log2(amax)) - 2 + 127) : 0;
+    const float d = GGML_E8M0_TO_FP32_HALF(e);
+
+    dsti->e = e;
+
+    for (int j = 0; j < QK_MXFP4 / 2; ++j) {
+        const uint8_t x0 = best_index_mxfp4(xi[0 + j], d);
+        const uint8_t x1 = best_index_mxfp4(xi[QK_MXFP4 / 2 + j], d);
+
+        dsti->qs[j]  = x0;
+        dsti->qs[j] |= x1 << 4;
+    }
+}
+
+inline void cpy_blck_f32_nvfp4(const char * cxi, char * cdsti) {
+    const float *   xi   = (const float *) cxi;
+    block_nvfp4 *   dsti = (block_nvfp4 *) cdsti;
+
+    constexpr int n_sub = QK_NVFP4 / QK_NVFP4_SUB;
+
+    for (int s = 0; s < n_sub; ++s) {
+        const float * xb = xi + s * QK_NVFP4_SUB;
+
+        float amax = 0.0f;
+        for (int j = 0; j < QK_NVFP4_SUB; ++j) {
+            amax = sycl::fmax(amax, sycl::fabs((float) xb[j]));
+        }
+
+        const uint8_t ue = ggml_fp32_to_ue4m3(amax / 6.0f);
+        dsti->d[s] = ue;
+        const float d = ggml_ue4m3_to_fp32(ue);
+
+        for (int j = 0; j < QK_NVFP4_SUB / 2; ++j) {
+            const uint8_t x0 = best_index_mxfp4(xb[0 + j], d);
+            const uint8_t x1 = best_index_mxfp4(xb[QK_NVFP4_SUB / 2 + j], d);
+
+            dsti->qs[s * (QK_NVFP4_SUB / 2) + j] = x0 | (x1 << 4);
+        }
+    }
+}
+
+
 inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
    const float * xi   = (const float *) cxi;
    block_q4_0 *  dsti = (block_q4_0 *) cdsti;
@@ -44,9 +44,9 @@ void gated_delta_net_sycl(const float *     q,
    float *       attn_data        = dst;
    float *       state            = dst + attn_score_elems;

-    // input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
+    // input state holds s0 only [S_v, S_v, H, n_seqs] — seq stride is D = H * S_v * S_v.
    // output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
-    const int64_t state_in_offset      = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
+    const int64_t state_in_offset      = sequence * H * S_v * S_v + h_idx * S_v * S_v;
    const int64_t state_out_offset     = (sequence * H + h_idx) * S_v * S_v;
    const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
    state += state_out_offset;
@@ -63,9 +63,8 @@ void gated_delta_net_sycl(const float *     q,
        s_shard[r]  = curr_state[i];
    }

-    // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
-    // are written; earlier slots are left untouched (caller-owned).
-    const int shift = (int) n_tokens - K;
+    // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
+    // When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.

    for (int t = 0; t < n_tokens; t++) {
        const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
@@ -144,7 +143,7 @@ void gated_delta_net_sycl(const float *     q,

    // Write state back to global memory
        if constexpr (keep_rs_t) {
-            const int target_slot = t - shift;
+            const int target_slot = (int) n_tokens - 1 - t;
            if (target_slot >= 0 && target_slot < K) {
                float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
 #pragma unroll
@@ -315,8 +314,8 @@ void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor *

    dpct::queue_ptr stream = ctx.stream();

-    // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
-    const int K = (int) src_state->ne[1];
+    // K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
+    const int K = ggml_get_op_params_i32(dst, 0);
    const bool keep_rs = K > 1;

    if (kda) {
@@ -70,6 +70,7 @@
 #include "ggml-sycl/diag.hpp"
 #include "ggml-sycl/solve_tri.hpp"
 #include "ggml-sycl/gated_delta_net.hpp"
+#include "ggml-sycl/pool.hpp"

 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
@@ -147,11 +148,31 @@ static ggml_sycl_device_info ggml_sycl_init() {
            GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
            info.ext_oneapi_level_zero = false;
        }
+
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+        if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
+            ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
+            ze_device_properties_t props = {};
+            props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+            ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
+            info.devices[i].l0_discrete_gpu = r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
+        }
+#endif
    }

    for (int id = 0; id < info.device_count; ++id) {
        info.default_tensor_split[id] /= total_vram;
    }
+
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+    // Large buffers can be allocated before ggml_check_sycl() initializes other
+    // g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
+    g_ggml_sycl_enable_level_zero =
+        info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
+#else
+    g_ggml_sycl_enable_level_zero = 0;
+#endif
+
    return info;
 }

@@ -236,38 +257,19 @@ void ggml_backend_sycl_print_sycl_devices() {
    print_device_opt_feature(device_count);
 }

-static inline int get_sycl_env(const char *env_name, int default_val) {
-    char *user_device_string = getenv(env_name);
-    int user_number = default_val;
-
-    unsigned n;
-    if (user_device_string != NULL &&
-        sscanf(user_device_string, " %u", &n) == 1) {
-        user_number = (int)n;
-    } else {
-        user_number = default_val;
-    }
-    return user_number;
-}
-
 static void ggml_check_sycl() try {
    static bool initialized = false;

    if (!initialized) {
-        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
-        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
-        g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
-        g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
-        g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
-        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
-#else
-        g_ggml_sycl_enable_level_zero = 0;
-#endif
+        g_ggml_sycl_debug = ggml_sycl_get_env("GGML_SYCL_DEBUG", 0);
+        g_ggml_sycl_disable_optimize = ggml_sycl_get_env("GGML_SYCL_DISABLE_OPT", 0);
+        g_ggml_sycl_disable_graph = ggml_sycl_get_env("GGML_SYCL_DISABLE_GRAPH", 1);
+        g_ggml_sycl_disable_dnn = ggml_sycl_get_env("GGML_SYCL_DISABLE_DNN", 0);
+        g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1);
+        g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);

 #ifdef SYCL_FLASH_ATTN
-        g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
+        g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
 #else
        g_ggml_sycl_enable_flash_attention = 0;
 #endif
@@ -330,7 +332,7 @@ static void ggml_check_sycl() try {
        GGML_LOG_INFO("  GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
 #endif
        GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
-        g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
+        g_ggml_sycl_use_async_mem_op_requested = ggml_sycl_get_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
        GGML_LOG_INFO("  GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);

 #ifdef SYCL_FLASH_ATTN
@@ -569,26 +571,18 @@ catch (sycl::exception const &exc) {
 }

 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
-static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
-    if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
-        return false;
-    }
-
-    ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
-    ze_device_properties_t props = {};
-    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-    ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
-    return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
+static bool ggml_sycl_is_l0_discrete_gpu(int device) {
+    return ggml_sycl_info().devices[device].l0_discrete_gpu;
 }
 #endif

-static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
+static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
                    const void *ptr_src, size_t size) {
 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
    // Use Level Zero direct copy for dGPU-to-dGPU transfers.
-    const bool l0_copy_supported =
-        ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
-    if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
+    const bool l0_copy_supported = g_ggml_sycl_enable_level_zero &&
+        ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
+    if (l0_copy_supported) {
        auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
        auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
        ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
@@ -651,7 +645,7 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
        size_t size = ggml_nbytes(src);

        //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs.
-        dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size);
+        dev2dev_memcpy(dst_ctx->device, *stream_dst, src_ctx->device, *stream_src, dst->data, src->data, size);

 //todo, it's known issue：error in device2device cross GPUs. reused when the issue is fixed. DON"T remove
 #if 0
@@ -1947,69 +1941,6 @@ static void scale_f32(const float * x, float * dst, const float scale, const flo
 }


-template <typename Ti, typename To>
-static  void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op,
-        const sycl::nd_item<3> &item_ct1) {
-        int idx = item_ct1.get_local_id(2) +
-                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
-        if (idx >= parallel_elements) {
-            return;
-        }
-
-        const int I_HW = ih * iw;
-        const int O_HW = oh * ow;
-        const int nc = idx / O_HW;
-        const int cur_oh = idx % O_HW / ow;
-        const int cur_ow = idx % O_HW % ow;
-        const Ti* i_ptr = src + nc * I_HW;
-        To* o_ptr = dst + nc * O_HW;
-        const int start_h = cur_oh * sh - ph;
-        const int bh = sycl::max(0, start_h);
-        const int eh = sycl::min(ih, start_h + kh);
-        const int start_w = cur_ow * sw - pw;
-        const int bw = sycl::max(0, start_w);
-        const int ew = sycl::min(iw, start_w + kw);
-
-        To res = 0;
-
-        switch (op) {
-            case GGML_OP_POOL_AVG: res = 0; break;
-            case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-            default:
-                res      = (To) sycl::nan(uint32_t(0));
-                break;
-        }
-
-        for (int i = bh; i < eh; i += 1) {
-            for (int j = bw; j < ew; j += 1) {
-#if DPCT_COMPATIBILITY_TEMP >= 350
-                /*
-                DPCT1098:106: The '*' expression is used instead of the __ldg
-                call. These two expressions do not provide the exact same
-                functionality. Check the generated code for potential precision
-                and/or performance issues.
-                */
-                Ti cur = *(i_ptr + i * iw + j);
-#else
-                Ti cur = i_ptr[i * iw + j];
-#endif
-                switch (op) {
-                    case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
-                    case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
-                    default:
-                        res = (To) sycl::nan(uint32_t(0));
-                        break;
-                }
-            }
-        }
-        o_ptr[cur_oh * ow + cur_ow] = res;
-}
-
-
 static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
                                           float *dst, const int ncols_x,
                                           const int nrows_x,
@@ -2558,45 +2489,6 @@ catch (sycl::exception const &exc) {
  std::exit(1);
 }

-static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    dpct::queue_ptr main_stream = ctx.stream();
-    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float *       dst_dd  = static_cast<float *>(dst->data);
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = dst->src[0]->ne[1];
-    const int64_t IW = dst->src[0]->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-    const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;
-    sycl::range<3> block_nums(1, 1, num_blocks);
-    main_stream->parallel_for(
-        sycl::nd_range<3>(block_nums *
-                              sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0,
-                               parallel_elements, src0_dd, dst_dd, op,
-                               item_ct1);
-        });
-}
-
 inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -3056,7 +2948,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                            src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;

                            SYCL_CHECK(
-                                CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
+                                CHECK_TRY_ERROR(dev2dev_memcpy(i, *stream, ctx.device, *main_stream, src1_ddf_i, src1_ddf_i_source,
                                                               src1_ncols * ne10 * sizeof(float))));
                        }
                    }
@@ -3971,7 +3863,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
            ctx.opt_feature.reorder &&      //allow this device due to good perf, skip the devices with bad perf.
            dst->op == GGML_OP_MUL_MAT &&   //limit to some supported cases of Q4_0, to do for more cases.
-            dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+            // ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder;
+            // all reorderable types have a _switch_ncols kernel.
+            dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
 }

 static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -4433,6 +4327,11 @@ static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
    ggml_sycl_op_pool2d(ctx, dst);
 }

+static void ggml_sycl_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_pool1d(ctx, dst);
+}
+
 static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_im2col(ctx, dst);
@@ -4746,6 +4645,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_POOL_2D:
            ggml_sycl_pool2d(ctx, dst);
            break;
+        case GGML_OP_POOL_1D:
+            ggml_sycl_pool1d(ctx, dst);
+            break;
        case GGML_OP_SUM:
            ggml_sycl_sum(ctx, dst);
            break;
@@ -5340,10 +5242,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g

        case GGML_OP_SET_ROWS:
            {
-                return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
+
+                auto res = ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
                         op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
-                         op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
+                         op->type == GGML_TYPE_Q1_0 ||
+                         op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL ||
+                         op->type == GGML_TYPE_MXFP4 || op->type == GGML_TYPE_NVFP4) &&
+                        op->src[0]->type == GGML_TYPE_F32 &&
                        (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32));
+                return res;
            }
            break;
        case GGML_OP_CPY:
@@ -5500,6 +5407,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                k > 0 && k <= 32;
        }
        case GGML_OP_POOL_2D:
+        case GGML_OP_POOL_1D:
            return true;
        case GGML_OP_ACC:
            return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
--- a/Show More
+++ b/Show More