context : fix logits size overflow for huge batches

context : fix overflow when re-ordering huge outputs
2026-06-30 17:47:40 +02:00 · 2025-08-04 22:26:46 -04:00 · 2025-08-04 22:01:28 -04:00
214 changed files with 3079 additions and 19669 deletions
@@ -0,0 +1,22 @@
+node('x86_runner1'){            // CI pipeline: clean, check out, build llama.cpp for RISC-V, then smoke-run it under qemu. Runs on the agent labelled 'x86_runner1' (per the original note: an x86 runner with the latest vector qemu, vector gcc and required libraries - TODO confirm)
+    stage('Cleanup'){
+        cleanWs()               // Wipe whatever the previous CI build left in the workspace
+    }
+    stage('checkout repo'){
+        retry(5){               // Checkout can fail transiently; allow up to 5 tries before failing the stage
+            checkout scm        // Check out the repository configured for this job onto the agent
+        }
+    }
+    stage('Compiling llama.cpp'){
+        sh'''#!/bin/bash
+            make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+        '''
+    }
+    stage('Running llama.cpp'){
+        sh'''#!/bin/bash
+            module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc
+            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
+            cat llama_log.txt                   # Printing results
+        '''
+    }
+}
@@ -4,6 +4,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

+ARG GGML_CPU_ARM_ARCH=armv8-a
+
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -11,8 +13,10 @@ WORKDIR /app

 COPY . .

-RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+    elif [ "$TARGETARCH" = "arm64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
@@ -60,7 +60,8 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    && pip install --break-system-packages -r requirements.txt \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -40,7 +40,7 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
+        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
        multiple: true
    validations:
      required: true
@@ -42,7 +42,7 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
+        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
        multiple: true
    validations:
      required: true
@@ -22,11 +22,6 @@ Vulkan:
        - any-glob-to-any-file:
            - ggml/include/ggml-vulkan.h
            - ggml/src/ggml-vulkan/**
-IBM zDNN:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-zdnn.h
-            - ggml/src/ggml-zdnn/**
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@@ -1,43 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  bianbu-riscv64-native: # Bianbu 2.2
-    runs-on: self-hosted
-
-    steps:
-      - name: Install prerequisites
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y libatomic1
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  cmake
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
@@ -104,7 +104,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
@@ -144,7 +144,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64-webgpu
          evict-old-files: 1d
@@ -199,7 +199,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
@@ -251,7 +251,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
          evict-old-files: 1d
@@ -330,7 +330,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-rpc
          evict-old-files: 1d
@@ -363,7 +363,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
@@ -400,7 +400,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-webgpu
          evict-old-files: 1d
@@ -443,7 +443,7 @@ jobs:

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.1.2
+    container: rocm/dev-ubuntu-22.04:6.0.2

    steps:
      - name: Clone
@@ -457,7 +457,7 @@ jobs:
          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-hip
          evict-old-files: 1d
@@ -471,6 +471,16 @@ jobs:
            -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

+      - name: Build with legacy HIP support
+        id: cmake_build_legacy_hip
+        run: |
+          cmake -B build2 -S . \
+            -DCMAKE_C_COMPILER=hipcc \
+            -DCMAKE_CXX_COMPILER=hipcc \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGGML_HIP=ON
+          cmake --build build2 --config Release -j $(nproc)
+
  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
@@ -487,7 +497,7 @@ jobs:
          apt-get install -y build-essential git cmake libcurl4-openssl-dev

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-musa
          evict-old-files: 1d
@@ -532,7 +542,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-sycl
          evict-old-files: 1d
@@ -580,7 +590,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-sycl-fp16
          evict-old-files: 1d
@@ -611,7 +621,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-ios
          evict-old-files: 1d
@@ -648,7 +658,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-tvos
          evict-old-files: 1d
@@ -720,7 +730,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-swift
          evict-old-files: 1d
@@ -766,7 +776,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-msys2
          variant: ccache
@@ -834,7 +844,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.build }}
          variant: ccache
@@ -948,7 +958,7 @@ jobs:
              apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev

        - name: ccache
-          uses: ggml-org/ccache-action@v1.2.16
+          uses: hendrikmuhs/ccache-action@v1.2.16
          with:
            key: ubuntu-latest-cmake-cuda
            evict-old-files: 1d
@@ -977,7 +987,7 @@ jobs:
        uses: actions/checkout@v4

      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -1033,7 +1043,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
@@ -1079,7 +1089,7 @@ jobs:
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ${{ github.job }}
          evict-old-files: 1d
@@ -1113,11 +1123,6 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v4

-      - name: Setup Xcode
-        uses: maxim-lobanov/setup-xcode@v1
-        with:
-          xcode-version: latest-stable
-
      - name: Build
        id: cmake_build
        run: |
@@ -1151,7 +1156,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: android-build
          evict-old-files: 1d
@@ -1,53 +0,0 @@
-name: "Copilot Setup Steps"
-
-# Automatically run the setup steps when they are changed to allow for easy validation, and
-# allow manual testing through the repository's "Actions" tab
-on:
-  workflow_dispatch:
-  push:
-    paths:
-      - .github/workflows/copilot-setup-steps.yml
-  pull_request:
-    paths:
-      - .github/workflows/copilot-setup-steps.yml
-
-jobs:
-  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
-  copilot-setup-steps:
-    runs-on: ubuntu-latest
-
-    # Set the permissions to the lowest permissions possible needed for your steps.
-    # Copilot will be given its own token for its operations.
-    permissions:
-      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
-      contents: read
-
-    # You can define any steps you want, and they will run before the agent starts.
-    # If you do not check out your code, Copilot will do this for you.
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: copilot-setup-steps
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install Python dependencies
-        run: |
-          python3 -m venv .venv
-          .venv/bin/activate
-          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
-          pip install flake8 pyright
@@ -32,7 +32,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
@@ -85,7 +85,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
@@ -147,7 +147,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
@@ -198,7 +198,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
@@ -256,7 +256,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-cpu-${{ matrix.arch }}
          variant: ccache
@@ -328,7 +328,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
@@ -398,7 +398,7 @@ jobs:
        uses: actions/checkout@v4

      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -471,7 +471,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
@@ -545,7 +545,7 @@ jobs:
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
+        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d
@@ -600,7 +600,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode-build:
-    runs-on: macos-15
+    runs-on: macos-latest

    steps:
      - name: Checkout code
@@ -608,10 +608,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Xcode
-        run: |
-          sudo xcode-select -s /Applications/Xcode_16.4.app
-
      - name: Build
        id: cmake_build
        run: |
@@ -12,8 +12,6 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()

-message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
-
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

@@ -10,4 +10,3 @@
 /ggml/src/ggml-opt.cpp @JohannesGaessler
 /ggml/src/gguf.cpp @JohannesGaessler
 /ggml/src/ggml-vulkan/ @0cc4m
-/ggml/src/ggml-zdnn/ @taronaeo
@@ -17,8 +17,6 @@ LLM inference in C/C++

 ## Hot topics

- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
 - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
@@ -241,7 +239,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Infrastructure</summary>

- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
@@ -749,39 +749,6 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 // utils
 //

-// Helper function to parse tensor buffer override strings
-static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
-    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        auto * dev = ggml_backend_dev_get(i);
-        auto * buft = ggml_backend_dev_buffer_type(dev);
-        if (buft) {
-            buft_list[ggml_backend_buft_name(buft)] = buft;
-        }
-    }
-
-    for (const auto & override : string_split<std::string>(value, ',')) {
-        std::string::size_type pos = override.find('=');
-        if (pos == std::string::npos) {
-            throw std::invalid_argument("invalid value");
-        }
-        std::string tensor_name = override.substr(0, pos);
-        std::string buffer_type = override.substr(pos + 1);
-
-        if (buft_list.find(buffer_type) == buft_list.end()) {
-            printf("Available buffer types:\n");
-            for (const auto & it : buft_list) {
-                printf("  %s\n", ggml_backend_buft_name(it.second));
-            }
-            throw std::invalid_argument("unknown buffer type");
-        }
-        // keep strings alive and avoid leaking memory by storing them in a static vector
-        static std::list<std::string> buft_overrides;
-        buft_overrides.push_back(tensor_name);
-        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
-    }
-}
-
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
@@ -1026,10 +993,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

-    if (!params.speculative.tensor_buft_overrides.empty()) {
-        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
-    }
-
    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
@@ -1238,7 +1201,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
            common_params_print_completion(ctx_arg);
            exit(0);
        }
-        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
@@ -1507,14 +1469,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
-    add_opt(common_arg(
-        {"--swa-checkpoints"}, "N",
-        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
-        [](common_params & params, int value) {
-            params.n_swa_checkpoints = value;
-        }
-    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2395,15 +2349,40 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type", [](common_params & params, const std::string & value) {
-            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+            }
        }
    ));
-    add_opt(common_arg(
-        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
-        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
-            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cpu-moe", "-cmoe"},
        "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2426,27 +2405,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_N_CPU_MOE"));
-    add_opt(common_arg(
-        {"--cpu-moe-draft", "-cmoed"},
-        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
-        [](common_params & params) {
-            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
-    add_opt(common_arg(
-        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
-        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
-        [](common_params & params, int value) {
-            if (value < 0) {
-                throw std::invalid_argument("invalid value");
-            }
-            for (int i = 0; i < value; ++i) {
-                static std::list<std::string> buft_overrides_draft;
-                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
-                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        "number of layers to store in VRAM",
@@ -2697,7 +2655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2989,9 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
        "- none: leaves thoughts unparsed in `message.content`\n"
        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-        "(default: auto)",
+        "(default: deepseek)",
        [](common_params & params, const std::string & value) {
-            params.reasoning_format = common_reasoning_format_from_name(value);
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
+            else if (value == "none") {     params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
@@ -3172,7 +3133,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-tbd", "--threads-batch-draft"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3182,7 +3143,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-Cd", "--cpu-mask-draft"}, "M",
        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3575,51 +3536,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));


-    add_opt(
-        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
-                   string_format(
-                       "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
-                       (double) params.lr.lr0),
-                   [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
-            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(
-        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
-                   string_format(
-                       "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
-                       (double) params.lr.lr_min),
-                   [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
-            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(
-        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
-                   string_format(
-                       "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
-                       (double) params.lr.decay_epochs),
-                   [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
-            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg(
-                { "-wd", "--weight-decay" }, "WD",
-                string_format(
-                    "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
-                    (double) params.lr.wd),
-                [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
-                .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
-                       string_format("fraction of data to use as validation set for training (default: %.2g).",
-                                     (double) params.val_split),
-                       [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
-                .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
-                       string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
-                       [](common_params & params, int epochs) { params.lr.epochs = epochs; })
-                .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
-                       [](common_params & params, const std::string & name) {
-                           params.optimizer = common_opt_get_optimizer(name.c_str());
-                           if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
-                               throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
-                           }
-                       })
-                .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-
    return ctx_arg;
 }
@@ -55,15 +55,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
-    std::string arguments = "";
-    if (tool_call.contains("arguments")) {
-        if (tool_call.at("arguments").is_object()) {
-            arguments = tool_call.at("arguments").dump();
-        } else {
-            arguments = tool_call.at("arguments");
-        }
-    }
-
+    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
    return add_tool_call(name, id, arguments);
 }

@@ -126,8 +126,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 typedef minja::chat_template common_chat_template;

 struct common_chat_templates {
-    bool add_bos;
-    bool add_eos;
    bool has_explicit_template; // Model had builtin template or template overridde was specified.
    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
    std::unique_ptr<common_chat_template> template_tool_use;
@@ -145,8 +143,6 @@ struct templates_params {
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    json extra_context;
-    bool add_bos;
-    bool add_eos;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -296,7 +292,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
        }
        if (!msg.reasoning_content.empty()) {
            jmsg["reasoning_content"] = msg.reasoning_content;
-            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
        }
        if (!msg.tool_name.empty()) {
            jmsg["name"] = msg.tool_name;
@@ -450,8 +445,6 @@ std::string common_chat_format_single(

    common_chat_templates_inputs inputs;
    inputs.use_jinja = use_jinja;
-    inputs.add_bos = tmpls->add_bos;
-    inputs.add_eos = tmpls->add_eos;

    std::string fmt_past_msg;
    if (!past_msg.empty()) {
@@ -473,12 +466,9 @@ std::string common_chat_format_single(
    return ss.str();
 }

-std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
+std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
    common_chat_templates_inputs inputs;
    inputs.use_jinja = use_jinja;
-    inputs.add_bos = tmpls->add_bos;
-    inputs.add_eos = tmpls->add_eos;
-    inputs.chat_template_kwargs = chat_template_kwargs;
    auto add_simple_msg = [&](auto role, auto content) {
        common_chat_msg msg;
        msg.role = role;
@@ -554,21 +544,8 @@ common_chat_templates_ptr common_chat_templates_init(
            default_template_src = CHATML_TEMPLATE_SRC;
        }
    }
-
-    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
-    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
-    if (default_template_src.find("<|channel|>") != std::string::npos
-            // search for the error message and patch it
-            && default_template_src.find("in message.content or") != std::string::npos) {
-        string_replace_all(default_template_src,
-            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
-            "{%- if false %}");
-    }
-
    std::string token_bos = bos_token_override;
    std::string token_eos = eos_token_override;
-    bool add_bos = false;
-    bool add_eos = false;
    if (model) {
        const auto * vocab = llama_model_get_vocab(model);
        const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -583,13 +560,9 @@ common_chat_templates_ptr common_chat_templates_init(
        };
        token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
        token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
-        add_bos = llama_vocab_get_add_bos(vocab);
-        add_eos = llama_vocab_get_add_eos(vocab);
    }
    common_chat_templates_ptr tmpls(new common_chat_templates());
    tmpls->has_explicit_template = has_explicit_template;
-    tmpls->add_bos = add_bos;
-    tmpls->add_eos = add_eos;
    try {
        tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
    } catch (const std::exception & e) {
@@ -619,8 +592,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
        case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
        case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
-        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
-        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
        default:
            throw std::runtime_error("Unknown chat format");
    }
@@ -629,28 +600,13 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
    switch (format) {
        case COMMON_REASONING_FORMAT_NONE:     return "none";
-        case COMMON_REASONING_FORMAT_AUTO:     return "auto";
        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
-        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
        default:
            throw std::runtime_error("Unknown reasoning format");
    }
 }

-common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
-    if (format == "none") {
-        return COMMON_REASONING_FORMAT_NONE;
-    } else if (format == "auto") {
-        return COMMON_REASONING_FORMAT_AUTO;
-    } else if (format == "deepseek") {
-        return COMMON_REASONING_FORMAT_DEEPSEEK;
-    } else if (format == "deepseek-legacy") {
-        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
-    }
-    throw std::runtime_error("Unknown reasoning format: " + format);
-}
-
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
    std::string arguments;
    if (builder.is_partial()) {
@@ -792,10 +748,10 @@ static std::string apply(
    // instead of using `chat_template_options.use_bos_token = false`, since these tokens
    // may be needed inside the template / between messages too.
    auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
+    if (string_starts_with(result, tmpl.bos_token())) {
        result = result.substr(tmpl.bos_token().size());
    }
-    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
+    if (string_ends_with(result, tmpl.eos_token())) {
        result = result.substr(0, result.size() - tmpl.eos_token().size());
    }
    return result;
@@ -1333,174 +1289,6 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
        tool_calls_end);
 }

-static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
-
-    data.prompt = prompt;
-    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
-
-    // These special tokens are required to parse properly, so we include them
-    // even if parse_tool_calls is false.
-    data.preserved_tokens = {
-        "<|channel|>",
-        "<|constrain|>",
-        "<|message|>",
-        "<|start|>",
-        "<|end|>",
-    };
-
-    if (inputs.tools.is_array() && !inputs.tools.empty()) {
-        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-            // tool calls can appear in commentary or analysis channels
-            auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
-
-            std::vector<std::string> tool_rules_recipient_in_role;
-            std::vector<std::string> tool_rules_recipient_in_channel;
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                std::string name = function.at("name");
-                auto parameters = function.at("parameters");
-                builder.resolve_refs(parameters);
-
-                tool_rules_recipient_in_role.push_back(
-                    builder.add_rule(name + "-call",
-                        "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
-                        builder.add_schema(name + "-args", parameters)
-                    )
-                );
-
-                tool_rules_recipient_in_channel.push_back(
-                    builder.add_rule(name + "-call",
-                        "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
-                        builder.add_schema(name + "-args", parameters)
-                    )
-                );
-            });
-
-            auto recipient_in_role = builder.add_rule("recipient_in_role",
-                "\"<|start|>assistant\"? \" to=functions.\" ( " +
-                string_join(tool_rules_recipient_in_role, " | ") + " )"
-            );
-
-            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
-                channel + " \" to=functions.\" ( " +
-                string_join(tool_rules_recipient_in_channel, " | ") + " )"
-            );
-
-            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
-
-            // Trigger on tool calls that appear in the commentary channel
-            data.grammar_triggers.push_back({
-                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-                "<\\|channel\\|>(commentary|analysis) to"
-            });
-
-            // Trigger tool calls that appear in the role section, either at the
-            // start or in the middle.
-            data.grammar_triggers.push_back({
-                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
-                "^ to"
-            });
-
-            data.grammar_triggers.push_back({
-                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-                "<\\|start\\|>assistant to"
-            });
-        });
-    }
-
-    return data;
-}
-static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
-    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
-    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
-
-    static const common_regex start_regex("<\\|start\\|>assistant");
-    static const common_regex analysis_regex("<\\|channel\\|>analysis");
-    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
-    static const common_regex preamble_regex("<\\|channel\\|>commentary");
-    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
-    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
-
-    auto consume_end = [&](bool include_end = false) {
-        if (auto res = builder.try_find_literal("<|end|>")) {
-            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
-        }
-        return builder.consume_rest();
-    };
-
-    auto handle_tool_call = [&](const std::string & name) {
-        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
-            if (builder.syntax().parse_tool_calls) {
-                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
-                    throw common_chat_msg_partial_exception("incomplete tool call");
-                }
-            } else if (args->is_partial) {
-                throw common_chat_msg_partial_exception("incomplete tool call");
-            }
-        }
-    };
-
-    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
-        auto match = regex.search(input, 0, true);
-        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
-            return match;
-        }
-        return std::nullopt;
-    };
-
-    do {
-        auto header_start_pos = builder.pos();
-        auto content_start = builder.try_find_literal("<|message|>");
-        if (!content_start) {
-            throw common_chat_msg_partial_exception("incomplete header");
-        }
-
-        auto header = content_start->prelude;
-
-        if (auto match = regex_match(tool_call1_regex, header)) {
-            auto group = match->groups[1];
-            auto name = header.substr(group.begin, group.end - group.begin);
-            handle_tool_call(name);
-            continue;
-        }
-
-        if (auto match = regex_match(tool_call2_regex, header)) {
-            auto group = match->groups[2];
-            auto name = header.substr(group.begin, group.end - group.begin);
-            handle_tool_call(name);
-            continue;
-        }
-
-        if (regex_match(analysis_regex, header)) {
-            builder.move_to(header_start_pos);
-            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
-                builder.add_content(consume_end(true));
-            } else {
-                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
-            }
-            continue;
-        }
-
-        if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
-            builder.add_content(consume_end());
-            continue;
-        }
-
-        // Possibly a malformed message, attempt to recover by rolling
-        // back to pick up the next <|start|>
-        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
-        builder.move_to(header_start_pos);
-    } while (builder.try_find_regex(start_regex, std::string::npos, false));
-
-    auto remaining = builder.consume_rest();
-    if (!remaining.empty()) {
-        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
-    }
-}
-
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    LOG_DBG("%s\n", __func__);
    common_chat_params data;
@@ -1910,124 +1698,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
    builder.add_content(builder.consume_rest());
 }

-static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-
-    // Pass thinking context for Granite template
-    json additional_context = {
-        {"thinking", inputs.enable_thinking},
-    };
-
-    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
-    data.format = COMMON_CHAT_FORMAT_GRANITE;
-
-    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
-        if (!inputs.enable_thinking) {
-            data.prompt += "</think>";
-        } else {
-            data.thinking_forced_open = true;
-        }
-    }
-
-    if (!inputs.tools.is_null()) {
-        // Granite uses <|tool_call|> followed by JSON list
-        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-            std::vector<std::string> tool_rules;
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                std::string name = function.at("name");
-                auto parameters = function.at("parameters");
-                builder.resolve_refs(parameters);
-                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
-"-args", {
-                    {"type", "object"},
-                    {"properties", {
-                        {"name", {{"const", name}}},
-                        {"arguments", parameters},
-                    }},
-                    {"required", json::array({"name", "arguments"})},
-                })));
-            });
-
-            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
-            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
-
-            if (data.thinking_forced_open) {
-                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
-            } else {
-                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
-            }
-
-            data.grammar_triggers.push_back({
-                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
-                "<|tool_call|>"
-            });
-
-            data.preserved_tokens = {
-                "<think>",
-                "</think>",
-                "<response>",
-                "</response>",
-                "<|tool_call|>",
-            };
-        });
-    } else {
-        // Handle thinking tags for non-tool responses
-        if (data.thinking_forced_open && inputs.enable_thinking) {
-            data.grammar_lazy = false;
-            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
-            });
-            data.preserved_tokens = {
-                "<think>",
-                "</think>",
-                "<response>",
-                "</response>",
-            };
-        }
-    }
-
-    return data;
-}
-
-static void common_chat_parse_granite(common_chat_msg_parser & builder) {
-    // Parse thinking tags
-    builder.try_parse_reasoning("<think>", "</think>");
-
-    // Parse response tags using regex
-    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
-    if (auto res = builder.try_find_regex(response_regex)) {
-        // Extract the content between the tags (capture group 1)
-        auto content = builder.str(res->groups[1]);
-        builder.add_content(content);
-        builder.move_to(res->groups[0].end);
-    }
-
-    if (!builder.syntax().parse_tool_calls) {
-        builder.add_content(builder.consume_rest());
-        return;
-    }
-
-    // Look for tool calls
-    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
-    if (auto res = builder.try_find_regex(tool_call_regex)) {
-        builder.move_to(res->groups[0].end);
-
-        // Expect JSON array of tool calls
-        auto tool_calls_data = builder.consume_json();
-        if (tool_calls_data.json.is_array()) {
-            if (!builder.add_tool_calls(tool_calls_data.json)) {
-                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
-            }
-        } else {
-            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
-        }
-    } else {
-        builder.add_content(builder.consume_rest());
-    }
-}
-
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
@@ -2061,8 +1731,6 @@ static common_chat_params common_chat_templates_apply_jinja(
    params.enable_thinking = inputs.enable_thinking;
    params.grammar = inputs.grammar;
    params.now = inputs.now;
-    params.add_bos = tmpls->add_bos;
-    params.add_eos = tmpls->add_eos;

    params.extra_context = json::object();
    for (auto el : inputs.chat_template_kwargs) {
@@ -2099,21 +1767,11 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_command_r7b(tmpl, params);
    }

-    // Granite (IBM) - detects thinking / tools support
-    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
-        return common_chat_params_init_granite(tmpl, params);
-    }
-
    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
    }

-    // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
-        return common_chat_params_init_gpt_oss(tmpl, params);
-    }
-
    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2164,7 +1822,6 @@ static common_chat_params common_chat_templates_apply_legacy(
    int alloc_size = 0;
    std::vector<llama_chat_message> chat;
    std::vector<std::string> contents;
-
    for (const auto & msg : inputs.messages) {
        auto content = msg.content;
        for (const auto & part : msg.content_parts) {
@@ -2266,12 +1923,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            common_chat_parse_command_r7b(builder);
            break;
-        case COMMON_CHAT_FORMAT_GRANITE:
-            common_chat_parse_granite(builder);
-            break;
-        case COMMON_CHAT_FORMAT_GPT_OSS:
-            common_chat_parse_gpt_oss(builder);
-            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
@@ -109,8 +109,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_GRANITE,
-    COMMON_CHAT_FORMAT_GPT_OSS,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -129,8 +127,6 @@ struct common_chat_templates_inputs {
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
-    bool add_bos = false;
-    bool add_eos = false;
 };

 struct common_chat_params {
@@ -187,12 +183,10 @@ std::string common_chat_format_single(
 // Returns an example of formatted chat
 std::string common_chat_format_example(
    const struct common_chat_templates * tmpls,
-    bool use_jinja,
-    const std::map<std::string, std::string> & chat_template_kwargs);
+    bool use_jinja);

 const char*               common_chat_format_name(common_chat_format format);
 const char*               common_reasoning_format_name(common_reasoning_format format);
-common_reasoning_format   common_reasoning_format_from_name(const std::string & format);
 common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -41,7 +41,6 @@
 #endif
 #include <locale>
 #include <windows.h>
-#include <string.h>
 #include <fcntl.h>
 #include <io.h>
 #else
@@ -1566,56 +1565,3 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

    return result;
 }
-
-ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
-    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
-    const lr_opt &            d      = *(lr_opt *) userdata;
-    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
-    result.sgd.wd = result.adamw.wd = d.wd;
-    return result;
-}
-
-// TODO make all command line args case-insensitive
-static inline bool eq_case_insensitive(char const* a, char const* b) {
-    return !
-#if defined(_MSC_VER)
-        _stricmp
-#else
-        strcasecmp
-#endif // defined(_MSC_VER)
-        (a, b);
-}
-
-enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
-    if (eq_case_insensitive("adamw", n)) {
-        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
-    }
-    if (eq_case_insensitive("sgd", n)) {
-        return GGML_OPT_OPTIMIZER_TYPE_SGD;
-    }
-    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
-}
-
-// TODO simplify to use just log and exp
-static float const k_log_2 = std::log(2.f);
-
-void lr_opt::init() {
-    if (lr_min > 0 && lr_min < lr0) {
-        float nhalf = std::log(lr0 / lr_min) / k_log_2;
-        float e     = epochs;
-        if (decay_epochs > 0 && decay_epochs < e) {
-            e = decay_epochs;
-        } else {
-            decay_epochs = e;
-        }
-        scale_epoch = nhalf / e;
-    }
-}
-
-float lr_opt::get_lr(float epoch) const {
-    float r = lr_min <= 0 ? lr0 :
-        epoch >= decay_epochs ? lr_min :
-        lr0 * std::pow(0.5f, epoch * scale_epoch);
-    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
-    return r;
-}
@@ -2,17 +2,14 @@

 #pragma once

+#include "llama-cpp.h"
+
 #include <set>
-#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
 #include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -85,7 +82,6 @@ enum llama_example {
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
-    LLAMA_EXAMPLE_FINETUNE,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -206,7 +202,6 @@ struct common_params_speculative {
    float   p_split      =  0.1f; // speculative decoding split probability
    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -241,31 +236,10 @@ struct common_params_diffusion {

 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

-
-struct lr_opt {
-    float    lr0          = 1e-5; // learning rate at first epoch
-    float    lr_min       = -1;
-    float    decay_epochs = -1;   // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
-    float    scale_epoch  = 0;
-    float    wd           = 0;
-    unsigned epochs       = 2;
-
-    unsigned epoch; // set by optimizer outer (epochs) loop
-    // learning rate decay - constant LR per epoch only for now
-    float get_lr(float e) const;
-    float get_lr() const { return get_lr(epoch); }
-    // must call after arg parse, before get_lr
-    void init();
-};
-
-struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
-
 struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =  4096; // context size
@@ -400,11 +374,6 @@ struct common_params {
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)

-    // finetune
-    struct lr_opt lr;
-    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
-    float val_split = 0.05f; // fraction of the data used for the validation set
-
    // embedding
    bool embedding         = false; // get only sentence embedding
    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@@ -413,12 +382,11 @@ struct common_params {
    std::string cls_sep    = "\t";  // separator of classification sequences

    // server params
-    int32_t port              = 8080;         // server listens on this network port
-    int32_t timeout_read      = 600;          // http read timeout in seconds
-    int32_t timeout_write     = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
-    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    int32_t n_swa_checkpoints = 3;            // max number of SWA checkpoints per slot
+    int32_t port           = 8080;         // server listens on this network port
+    int32_t timeout_read   = 600;          // http read timeout in seconds
+    int32_t timeout_write  = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
@@ -426,7 +394,7 @@ struct common_params {
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

@@ -733,6 +701,3 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 //

 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
-
-// "adamw" or "sgd" (case insensitive)
-enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
@@ -28,14 +28,6 @@ if TYPE_CHECKING:
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
-from gguf.vocab import MistralTokenizerType, MistralVocab
-from mistral_common.tokens.tokenizers.base import TokenizerVersion
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer,
-)
-

 logger = logging.getLogger("hf-to-gguf")

@@ -89,8 +81,6 @@ class ModelBase:
    block_count: int
    tensor_map: gguf.TensorNameMap

-    is_mistral_format: bool = False
-
    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
@@ -116,17 +106,16 @@ class ModelBase:
                logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
                remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
                self.tensor_names = set(name for name in remote_tensors.keys())
-                for name, remote_tensor in remote_tensors.items():
+                for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items():
                    yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))

            self.get_tensors = get_remote_tensors
        else:
-            prefix = "model" if not self.is_mistral_format else "consolidated"
-            self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
+            self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors")
            self.is_safetensors = len(self.part_names) > 0
            if not self.is_safetensors:
                self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
+        self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
        self.tensor_names = None
        self.metadata_override = metadata_override
        self.model_name = model_name
@@ -164,23 +153,19 @@ class ModelBase:
    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        tensor_names_from_parts: set[str] = set()

-        if not self.is_mistral_format:
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
-            index_file = self.dir_model / index_name
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name

-            if index_file.is_file():
-                self.tensor_names = set()
-                logger.info(f"gguf: loading model weight map from '{index_name}'")
-                with open(index_file, "r", encoding="utf-8") as f:
-                    index: dict[str, Any] = json.load(f)
-                    weight_map = index.get("weight_map")
-                    if weight_map is None or not isinstance(weight_map, dict):
-                        raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
-                    self.tensor_names.update(weight_map.keys())
-            else:
-                self.tensor_names = tensor_names_from_parts
-                weight_map = {}
+        if index_file.is_file():
+            self.tensor_names = set()
+            logger.info(f"gguf: loading model weight map from '{index_name}'")
+            with open(index_file, "r", encoding="utf-8") as f:
+                index: dict[str, Any] = json.load(f)
+                weight_map = index.get("weight_map")
+                if weight_map is None or not isinstance(weight_map, dict):
+                    raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
+                self.tensor_names.update(weight_map.keys())
        else:
            self.tensor_names = tensor_names_from_parts
            weight_map = {}
@@ -441,12 +426,7 @@ class ModelBase:
        return part_names

    @staticmethod
-    def load_hparams(dir_model: Path, is_mistral_format: bool):
-        if is_mistral_format:
-            with open(dir_model / "params.json", "r", encoding="utf-8") as f:
-                config = json.load(f)
-            return config
-
+    def load_hparams(dir_model: Path):
        try:
            # for security reason, we don't allow loading remote code by default
            # if a model need remote code, we will fallback to config.json
@@ -496,10 +476,7 @@ class TextModel(ModelBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        if not self.is_mistral_format:
-            self.hf_arch = get_model_architecture(self.hparams, self.model_type)
-        else:
-            self.hf_arch = ""
+        self.hf_arch = get_model_architecture(self.hparams, self.model_type)

        if "text_config" in self.hparams:
            # move the text_config to the root level
@@ -565,14 +542,14 @@ class TextModel(ModelBase):
            self.gguf_writer.add_head_count(n_head)
            logger.info(f"gguf: head count = {n_head}")

-        if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None:
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
            logger.info(f"gguf: key-value head count = {n_head_kv}")

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
            logger.info(f"gguf: rope theta = {rope_theta}")
-        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
+        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
@@ -1233,19 +1210,12 @@ class MmprojModel(ModelBase):
            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")

        # get n_embd of the text model
-        if not self.is_mistral_format:
-            if "text_config" not in self.hparams:
-                self.hparams["text_config"] = {}
-            if "audio_config" not in self.hparams:
-                self.hparams["audio_config"] = {}
-            text_config = {**self.hparams, **self.hparams["text_config"]}
-            self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
-        else:
-            text_config = {
-                k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
-            }
-            self.n_embd_text = text_config.get("hidden_dim", 0)
-
+        if "text_config" not in self.hparams:
+            self.hparams["text_config"] = {}
+        if "audio_config" not in self.hparams:
+            self.hparams["audio_config"] = {}
+        text_config = {**self.hparams, **self.hparams["text_config"]}
+        self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
        assert self.n_embd_text > 0, "n_embd not found in hparams"

        # move vision config to the top level, while preserving the original hparams in global_config
@@ -1266,13 +1236,11 @@ class MmprojModel(ModelBase):
        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

        # load preprocessor config
-        if not self.is_mistral_format:
-            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
-                self.preprocessor_config = json.load(f)
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)

    def get_vision_config(self) -> dict[str, Any] | None:
-        config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
-        return self.global_config.get(config_name)
+        return self.global_config.get("vision_config")

    def get_audio_config(self) -> dict[str, Any] | None:
        return self.global_config.get("audio_config")
@@ -1296,11 +1264,8 @@ class MmprojModel(ModelBase):
            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))

            # preprocessor config
-            image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
-            image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
-
-            self.gguf_writer.add_vision_image_mean(image_mean)
-            self.gguf_writer.add_vision_image_std(image_std)
+            self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])

        if self.has_audio_encoder:
            self.gguf_writer.add_clip_has_audio_encoder(True)
@@ -1959,63 +1924,11 @@ class LlamaModel(TextModel):
        if self.hf_arch == "VLlama3ForCausalLM":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

-    def _set_vocab_mistral(self):
-        vocab = MistralVocab(self.dir_model)
-        logger.info(
-            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
-        )
-
-        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
-
-        tokens = []
-        scores = []
-        toktypes = []
-
-        for text, score, toktype in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        assert len(tokens) == vocab.vocab_size, (
-            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
-        )
-
-        if vocab.tokenizer_type == MistralTokenizerType.tekken:
-            self.gguf_writer.add_tokenizer_pre("tekken")
-            self.gguf_writer.add_token_merges(
-                vocab.extract_vocab_merges_from_model()
-            )
-
-        logger.info(
-            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
-        )
-
-        self.gguf_writer.add_bos_token_id(vocab.bos_id)
-        self.gguf_writer.add_eos_token_id(vocab.eos_id)
-        self.gguf_writer.add_unk_token_id(vocab.unk_id)
-        self.gguf_writer.add_pad_token_id(vocab.pad_id)
-
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_vocab_size(vocab.vocab_size)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(False)
-
-        template_dir = Path(__file__).parent / "models/templates/"
-
-        template = MistralModel.get_community_chat_template(vocab, template_dir)
-        self.gguf_writer.add_chat_template(template)
-
    def set_vocab(self):
-        if self.is_mistral_format:
-            return self._set_vocab_mistral()
-
        path_tekken_json = self.dir_model / "tekken.json"
        path_tokenizer_json = self.dir_model / "tokenizer.json"
        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
-            self._set_vocab_mistral()
+            return self.set_vocab_tekken()

        try:
            self._set_vocab_sentencepiece()
@@ -2049,12 +1962,56 @@ class LlamaModel(TextModel):
        if self.hparams.get("vocab_size", 32000) == 49152:
            self.gguf_writer.add_add_bos_token(False)

+    def set_vocab_tekken(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+            self.gguf_writer.add_chat_template(template)
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-
-        if not self.is_mistral_format:
-            self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])

        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
@@ -2076,25 +2033,13 @@ class LlamaModel(TextModel):
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.find_hparam(["n_heads", "num_attention_heads"])
-        n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
-
-        vision_prefixes = [
-            "vision_encoder.",
-            "vision_language_adapter.",
-            "patch_merger.",
-            "pre_mm_projector_norm",
-        ]
-
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
        is_multimodal_tensor = "vision_tower" in name \
            or "vision_model" in name \
            or "audio_tower" in name \
            or "model.connector" in name \
-            or "multi_modal_projector" in name \
-            or any(
-                name.startswith(prefix)
-                for prefix in vision_prefixes
-            )
+            or "multi_modal_projector" in name

        if is_multimodal_tensor:
            return [] # skip vision tensors
@@ -2210,18 +2155,13 @@ class LlavaVisionModel(MmprojModel):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        if self.hparams.get("model_type") == "pixtral":
+        if self.hparams["model_type"] == "pixtral":
            # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
-        elif self.is_mistral_format:
-            # hparams is already vision config here so norm_eps is only defined in global_config.
-            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
-            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
-            self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
+            logger.info(f"Image break token id: {self.img_break_tok_id}")
        else:
            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
-        logger.info(f"Image break token id: {self.img_break_tok_id}")

    def get_token_id(self, token: str) -> int:
        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
@@ -2235,7 +2175,7 @@ class LlavaVisionModel(MmprojModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-        if hparams.get("model_type") == "pixtral":
+        if hparams["model_type"] == "pixtral":
            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])

@@ -2253,30 +2193,18 @@ class LlavaVisionModel(MmprojModel):

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
-        n_head = (
-            self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"])
-        )
+        n_head = self.hparams["num_attention_heads"]
        n_kv_head = n_head

-        valid_prefixes = (
-            "multi_modal_projector.",
-            "vision_tower.",
-            "vision_encoder.",
-            "vision_language_adapter.",
-            "patch_merger.",
-            "pre_mm_projector_norm",
-        )
-
-        if any(name.startswith(prefix) for prefix in valid_prefixes):
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
            # process vision tensors
-            if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-            if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format:
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
            return [(self.map_tensor_name(name), data_torch)]

-        embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight"
-        if self.img_break_tok_id > 0 and embed_key in name:
+        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
            img_break_embd = data_torch[self.img_break_tok_id]
@@ -3400,13 +3328,7 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
@ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
    def set_gguf_parameters(self):
-        assert self.hparams_vision is not None
-        if isinstance(self.hparams_vision['image_size'], list):
-            self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
-        if isinstance(self.hparams_vision['patch_size'], list):
-            self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
        super().set_gguf_parameters()
-
        hparams = self.hparams
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
@@ -3430,30 +3352,14 @@ class InternVisionModel(MmprojModel):
            return gguf.GGMLQuantizationType.F32
        return False

-    def _mapping_interns1_name(self, name):
-        names_map = {
-            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
-            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
-            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
-            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
-            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
-            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
-        }
-        if name in names_map:
-            name = names_map[name]
-        return name
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
-        vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
-        # deal with intern-s1 special case
-        name = self._mapping_interns1_name(name)
-        if any([name.startswith(prefix) for prefix in vision_prefix]):
+        if name.startswith("vision_model") or name.startswith("mlp"):
            # process visual tensors
            # correct name
            if name.startswith("vision_model"):
                name = "vision_tower." + name
-            if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
+            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
                name += ".weight"
            # split QKV tensors if needed
            if ".qkv." in name:
@@ -3539,10 +3445,6 @@ class Qwen2MoeModel(TextModel):

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # process the experts separately
-        name = name.replace("language_model.", "") # InternVL
-        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
-            # skip visual tensors
-            return []
        if name.find("experts") != -1:
            n_experts = self.hparams["num_experts"]
            assert bid is not None
@@ -3596,85 +3498,6 @@ class Qwen3Model(Qwen2Model):
 class Qwen3MoeModel(Qwen2MoeModel):
    model_arch = gguf.MODEL_ARCH.QWEN3MOE

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        hparams = ModelBase.load_hparams(self.dir_model, False)
-        self.origin_hf_arch = hparams.get('architectures', [None])[0]
-
-    def set_vocab(self):
-        # deal with intern-s1
-        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
-            self._set_vocab_interns1()
-            return
-
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding = 'utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-                for token in additional_special_tokens:
-                    if token in token2ids_map:
-                        special_vocab._set_special_token(token, token2ids_map[token])
-        special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
-

@ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
@@ -4755,7 +4578,7 @@ class NomicBertModel(BertModel):
    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
        hparams = kwargs.pop("hparams", None)
        if hparams is None:
-            hparams = ModelBase.load_hparams(dir_model, False)
+            hparams = ModelBase.load_hparams(dir_model)

        self.is_moe = bool(hparams.get("moe_every_n_layers"))
        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
@@ -8127,130 +7950,6 @@ class SmolLM3Model(LlamaModel):
            self.gguf_writer.add_chat_template(chat_template)


-@ModelBase.register("GptOssForCausalLM")
-class GptOssModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.GPT_OSS
-
-    def transform_nibble_layout(self, tensor):
-        assert tensor.dtype == torch.uint8
-        assert tensor.shape[-1] == 16
-        # swap nibbles
-        t_lo = tensor & 0x0F
-        t_hi = tensor & 0xF0
-        t_swapped = (t_lo << 4) | (t_hi >> 4)
-        tensor = t_swapped
-        # transform aaaa...bbbb... to abababab...
-        blk_a, blk_b = tensor.chunk(2, dim=-1)
-        # get a_
-        blk_a0 = (blk_a & 0xF0).view(-1, 1)
-        blk_a1 = (blk_a << 4).view(-1, 1)
-        blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
-        # get _b
-        blk_b0 = (blk_b >> 4).view(-1, 1)
-        blk_b1 = (blk_b & 0x0F).view(-1, 1)
-        blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
-        # swap once more
-        out = blk_a | blk_b
-        out_h = out & 0xF0
-        out_l = out & 0x0F
-        out = (out_h >> 4) | (out_l << 4)
-        return out
-
-    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
-        assert blocks.dtype == torch.uint8
-        assert scales.dtype == torch.uint8
-        scales = scales.unsqueeze(-1)
-        assert len(blocks.shape) == 4
-        assert len(scales.shape) == 4
-        blocks = self.transform_nibble_layout(blocks)
-        new_data = torch.concat((scales, blocks), dim=-1)
-        new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
-        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
-        # flatten last dim
-        new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
-        new_data = new_data.numpy()
-        self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        blocks0: Tensor = torch.zeros(1)
-        blocks1: Tensor = torch.zeros(1)
-        # we assume that tensors are loaded in the correct order
-        for name, data_torch in self.get_tensors():
-            if "mlp.experts.down_proj_blocks" in name:
-                blocks0 = data_torch
-            elif "mlp.experts.down_proj_scales" in name:
-                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
-                self.repack_mxfp4(new_name, blocks0, data_torch)
-            elif "mlp.experts.gate_up_proj_blocks" in name:
-                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
-            elif "mlp.experts.gate_up_proj_scales" in name:
-                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
-                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
-                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
-                self.repack_mxfp4(new_name_gate, blocks0, scales0)
-                self.repack_mxfp4(new_name_up, blocks1, scales1)
-        return []
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        if "sinks" in name:
-            name += ".weight"
-
-        # correct naming for down_proj
-        if "down_proj" in name:
-            if name.endswith("_bias"):
-                name = name.replace("down_proj_bias", "down_proj.bias")
-            elif "_blocks" not in name and "_scales" not in name:
-                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
-                name = name.replace("down_proj", "down_proj.weight")
-                data_torch = data_torch.transpose(-1, -2)
-            else:
-                # otherwise, it should already be repacked to ggml MXFP4 format
-                return []
-
-        # split the gate_up into gate and up
-        if "gate_up_proj" in name:
-            if name.endswith("_bias"):
-                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
-                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
-                gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
-                return [
-                    (self.map_tensor_name(name_gate), gate_proj_bias),
-                    (self.map_tensor_name(name_up), up_proj_bias)
-                ]
-            elif "_blocks" not in name and "_scales" not in name:
-                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
-                name_up = name.replace("gate_up_proj", "up_proj.weight")
-                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
-                data_torch = data_torch.transpose(-1, -2)
-                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
-                return [
-                    (self.map_tensor_name(name_gate), gate_proj_weight),
-                    (self.map_tensor_name(name_up), up_proj_weight)
-                ]
-            else:
-                # otherwise, it should already be repacked to ggml MXFP4 format
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
-
-
@ModelBase.register("Lfm2ForCausalLM")
@ModelBase.register("LFM2ForCausalLM")
 class LFM2Model(TextModel):
@@ -8376,77 +8075,6 @@ class SmallThinkerModel(TextModel):
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")

-
-class MistralModel(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.LLAMA
-    model_name = "Mistral"
-    hf_arch = ""
-    is_mistral_format = True
-    undo_permute = False
-
-    @staticmethod
-    def get_community_chat_template(vocab: MistralVocab, templates_dir: Path):
-        assert TokenizerVersion is not None, "mistral_common is not installed"
-        assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
-            f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
-        )
-
-        if vocab.tokenizer.version == TokenizerVersion.v1:
-            return "mistral-v1"
-        elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm:
-            return "mistral-v3"
-        elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken:
-            return "mistral-v3-tekken"
-        elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm:
-            return "mistral-v7"
-        elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken:
-            return "mistral-v7-tekken"
-        elif vocab.tokenizer.version == TokenizerVersion.v11:
-            template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
-        elif vocab.tokenizer.version == TokenizerVersion.v13:
-            template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
-        else:
-            raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}")
-
-        template_path = templates_dir / template_file
-        if not template_path.exists():
-            raise FileNotFoundError(f"Template file not found: {template_path}")
-
-        with open(template_path, "r", encoding="utf-8") as f:
-            template = f.read()
-
-        return template
-
-
-class PixtralModel(LlavaVisionModel):
-    model_name = "Pixtral"
-    hf_arch = ""
-    is_mistral_format = True
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
-
-        self.gguf_writer.add_vision_attention_layernorm_eps(
-            self.find_hparam(["norm_eps"])
-        )
-        self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"]))
-
-        self.gguf_writer.add_vision_use_silu(True)
-
-        # spatial_merge_size
-        if self.find_vparam(["mm_projector_id"]) == "patch_merge":
-            self.gguf_writer.add_vision_spatial_merge_size(
-                self.find_vparam(["spatial_merge_size"])
-            )
-
-    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
-        if name == "vision_language_adapter.w_in.weight":
-            return "mm.1.weight"
-        elif name == "vision_language_adapter.w_out.weight":
-            return "mm.2.weight"
-        return super().map_tensor_name(name, try_suffixes)
-
 ###### CONVERSION LOGIC ######


@@ -8461,7 +8089,6 @@ class LazyTorchTensor(gguf.LazyBase):
    _dtype_map: dict[torch.dtype, type] = {
        torch.float16: np.float16,
        torch.float32: np.float32,
-        torch.uint8: np.uint8,
    }

    # used for safetensors slices
@@ -8597,10 +8224,6 @@ def parse_args() -> argparse.Namespace:
        "--mmproj", action="store_true",
        help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
    )
-    parser.add_argument(
-        "--mistral-format", action="store_true",
-        help="Whether the model is stored following the Mistral format.",
-    )

    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
@@ -8706,25 +8329,17 @@ def main() -> None:
        if "mmproj" not in fname_out.name:
            fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")

-    is_mistral_format = args.mistral_format
-
    with torch.inference_mode():
        output_type = ftype_map[args.outtype]
        model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
-        hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
-        if not is_mistral_format:
-            model_architecture = get_model_architecture(hparams, model_type)
-            logger.info(f"Model architecture: {model_architecture}")
-            try:
-                model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
-            except NotImplementedError:
-                logger.error(f"Model {model_architecture} is not supported")
-                sys.exit(1)
-        elif args.mmproj:
-            assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
-            model_class = PixtralModel
-        else:
-            model_class = MistralModel
+        hparams = ModelBase.load_hparams(dir_model)
+        model_architecture = get_model_architecture(hparams, model_type)
+        logger.info(f"Model architecture: {model_architecture}")
+        try:
+            model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
+        except NotImplementedError:
+            logger.error(f"Model {model_architecture} is not supported")
+            sys.exit(1)

        model_instance = model_class(dir_model, output_type, fname_out,
                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
@@ -8733,8 +8348,7 @@ def main() -> None:
                                     split_max_tensors=args.split_max_tensors,
                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
                                     small_first_shard=args.no_tensor_first_split,
-                                     remote_hf_model_id=hf_repo_id,
-                                     )
+                                     remote_hf_model_id=hf_repo_id)

        if args.vocab_only:
            logger.info("Exporting model vocab...")
@@ -340,7 +340,7 @@ if __name__ == '__main__':
            sys.exit(1)
    else:
        logger.info(f"Loading base model: {dir_base_model.name}")
-        hparams = ModelBase.load_hparams(dir_base_model, False)
+        hparams = ModelBase.load_hparams(dir_base_model)

    with torch.inference_mode():
        try:
@@ -76,23 +76,6 @@ cmake --build build --config Release -j $(nproc)
    cmake --build build --config Release -j $(nproc)
    ```

-## IBM zDNN Accelerator
-
-This provides acceleration using the IBM zAIU co-processor located in the Telum I and Telum II processors. Make sure to have the [IBM zDNN library](https://github.com/IBM/zDNN) installed.
-
-#### Compile from source from IBM
-
-You may find the official build instructions here: [Building and Installing zDNN](https://github.com/IBM/zDNN?tab=readme-ov-file#building-and-installing-zdnn)
-
-### Compilation
-
-```bash
-cmake -S . -B build             \
-    -DCMAKE_BUILD_TYPE=Release  \
-    -DGGML_ZDNN=ON
-cmake --build build --config Release -j$(nproc)
-```
-
 ## Getting GGUF Models

 All models need to be converted to Big-Endian. You can achieve this in three cases:
@@ -162,15 +145,15 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

 ### 1. SIMD Acceleration

-Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
+Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. On such systems, the APIs can still run but will use a scalar implementation.

 ### 2. NNPA Vector Intrinsics Acceleration

-Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
+Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs can still run but will use a scalar implementation.

-### 3. zDNN Accelerator (WIP)
+### 3. zDNN Accelerator

-Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
+_Only available on IBM z16 / LinuxONE 4 or later systems. No support is currently available._

 ### 4. Spyre Accelerator

@@ -246,12 +229,11 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

 ## Appendix A: Hardware Support Matrix

-|          | Support | Minimum Compiler Version |
-| -------- | ------- | ------------------------ |
-| IBM z15  | ✅      |                          |
-| IBM z16  | ✅      |                          |
-| IBM z17  | ✅      | GCC 15.1.0               |
-| IBM zDNN | ✅      |                          |
+|         | Support | Minimum Compiler Version |
+| ------- | ------- | ------------------------ |
+| IBM z15 | ✅      |                          |
+| IBM z16 | ✅      |                          |
+| IBM z17 | ✅      | GCC 15.1.0               |

 -   ✅ - supported and verified to run as intended
 -   🚫 - unsupported, we are unlikely able to provide support
@@ -260,7 +242,7 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

 |            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
 | ---------- | ----------- | ---- | ---- | ----- |
-| FP32       | ✅          | ✅   | ✅   | ❓    |
+| FP32       | ✅          | ✅   | ❓   | ❓    |
 | FP16       | ✅          | ✅   | ❓   | ❓    |
 | BF16       | 🚫          | 🚫   | ❓   | ❓    |
 | Q4_0       | ✅          | ✅   | ❓   | ❓    |
@@ -291,4 +273,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 -   🚫 - acceleration unavailable, will still run using scalar implementation
 -   ❓ - acceleration unknown, please contribute if you can test it yourself

-Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025.
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta

 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggml-org/llama.cpp
+git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 ```

@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta

 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggml-org/llama.cpp
+git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 ```

@@ -12,92 +12,91 @@ Legend:
 - 🟡 Partially supported by this backend
 - ❌ Not supported by this backend

-| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
-|-----------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
-|                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
-|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
-|                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
-|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                             ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
-|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
-|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
+| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
+|-----------|------|------|------|------|------|------|------|------|
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
+|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
+|                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
+|                          CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
+|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
+|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
+|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
+|                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
+|                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
+|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
+|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
+|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
+|                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
+|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
+|                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
+|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
+|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
+|                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
+|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
+|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                             ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
+|                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+|                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                              SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
+|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
+|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
+|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
+|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
+|                              SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
+|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
@@ -7,7 +7,6 @@
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <numeric>

 /**
 * This the arbitrary data which will be passed to each callback.
@@ -78,12 +77,6 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
        LOG("                                     ]\n");
        LOG("                                     sum = %f\n", sum);
    }
-
-    // TODO: make this abort configurable/optional?
-    if (std::isnan(sum)) {
-        LOG_ERR("encountered NaN - aborting\n");
-        exit(0);
-    }
 }

 /**
@@ -59,8 +59,6 @@ int main(int argc, char ** argv) {
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
-    params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;
-
    common_init_result llama_init_dft = common_init_from_params(params);

    //model_dft = llama_init_dft.model.get();
@@ -85,8 +85,6 @@ int main(int argc, char ** argv) {
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
-    params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;
-
    common_init_result llama_init_dft = common_init_from_params(params);

    model_dft = llama_init_dft.model.get();
@@ -10,20 +10,20 @@
 #include <vector>

 #if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267)  // possible loss of data
+#pragma warning(disable: 4244 4267) // possible loss of data
 #endif

 int main(int argc, char ** argv) {
    common_params params;
+
    params.escape = false;

-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
        return 1;
    }

    if (params.use_mmap) {
-        LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
-                __func__);
+        LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
        params.use_mmap = false;
    }
    if (params.cache_type_k != GGML_TYPE_F32) {
@@ -38,10 +38,11 @@ int main(int argc, char ** argv) {
    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);
+
    // load the model and apply lora adapter, if any
-    common_init_result   llama_init = common_init_from_params(params);
-    llama_model_ptr    & model      = llama_init.model;
-    llama_context_ptr  & ctx        = llama_init.context;
+    common_init_result llama_init = common_init_from_params(params);
+    llama_model_ptr   & model = llama_init.model;
+    llama_context_ptr & ctx   = llama_init.context;

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
@@ -54,32 +55,31 @@ int main(int argc, char ** argv) {
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

-    std::vector<llama_token> tokens  = common_tokenize(ctx.get(), params.prompt, true);
-    ggml_opt_dataset_t       dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
+    constexpr float val_split = 0.05f;

-    struct lr_opt & lr = params.lr;
-    LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
-            ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
-            (unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
+    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);

-    struct llama_opt_params lopt_params{
-        /*n_ctx_train     =*/0,
-        /*param_filter    =*/llama_opt_param_filter_all,
-        /*param_filter_ud =*/nullptr,
-        /*get_opt_pars    =*/common_opt_lr_pars,
-        /*get_opt_pars_ud =*/&params.lr,
-        /*optimizer_type  =*/params.optimizer,
+    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
+    optimizer_params.adamw.alpha = 1e-7f; // learning rate
+
+    struct llama_opt_params lopt_params {
+        /*n_ctx_train     =*/ 0,
+        /*param_filter    =*/ llama_opt_param_filter_all,
+        /*param_filter_ud =*/ nullptr,
+        /*get_opt_pars    =*/ ggml_opt_get_constant_optimizer_params,
+        /*get_opt_pars_ud =*/ &optimizer_params,
    };
    llama_opt_init(ctx.get(), model.get(), lopt_params);

-    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
+    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval  = ggml_opt_result_init();

-    for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
+    for (int epoch = 0; epoch < 2; ++epoch) {
        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
-                        ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
        fprintf(stderr, "\n");

        ggml_opt_result_reset(result_train);
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);

-    llama_model_save_to_file(model.get(), params.out_file.c_str());
+    llama_model_save_to_file(model.get(), "finetuned-model.gguf");

    llama_backend_free();

@@ -36,6 +36,9 @@
  # ```
  # nixConfig = {
  #   extra-substituters = [
+  #     # Populated by the CI in ggml-org/llama.cpp
+  #     "https://llama-cpp.cachix.org"
+  #
  #     # A development cache for nixpkgs imported with `config.cudaSupport = true`.
  #     # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
  #     # This lets one skip building e.g. the CUDA-enabled openmpi.
@@ -44,8 +47,10 @@
  #   ];
  #
  #   # Verify these are the same keys as published on
+  #   # - https://app.cachix.org/cache/llama-cpp
  #   # - https://app.cachix.org/cache/cuda-maintainers
  #   extra-trusted-public-keys = [
+  #     "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
  #     "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  #   ];
  # };
@@ -176,7 +176,6 @@ option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12   "ggml: enable rocWMMA FlashAttention on GFX12"    OFF)
 option(GGML_HIP_MMQ_MFMA                    "ggml: enable MFMA MMA for CDNA in MMQ"           ON)
-option(GGML_HIP_EXPORT_METRICS              "ggml: enable kernel perf metrics output"         OFF)
 option(GGML_MUSA_GRAPHS                     "ggml: use MUSA graph, experimental, unstable"    OFF)
 option(GGML_MUSA_MUDNN_COPY                 "ggml: enable muDNN for accelerated copy"         OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
@@ -188,7 +187,6 @@ option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_WEBGPU                          "ggml: use WebGPU"                                OFF)
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
-option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
@@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml)

    find_library(GGML_LIBRARY ggml
        REQUIRED
-        HINTS ${GGML_LIB_DIR}
+        HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR}
        NO_CMAKE_FIND_ROOT_PATH)

    add_library(ggml::ggml UNKNOWN IMPORTED)
@@ -125,56 +125,54 @@ if(NOT TARGET ggml::ggml)
            IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")

    set(_ggml_all_targets "")
-    if (NOT GGML_BACKEND_DL)
-        foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
-            string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
-            string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
+    foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+        string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
+        string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)

-            find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
-                REQUIRED
-                HINTS ${GGML_LIB_DIR}
-                NO_CMAKE_FIND_ROOT_PATH)
+        find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+            REQUIRED
+            HINTS ${GGML_LIB_DIR}
+            NO_CMAKE_FIND_ROOT_PATH)

-            message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
+        message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")

-            add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
+        add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
+                IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+                IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
+                INTERFACE_COMPILE_FEATURES c_std_90
+                POSITION_INDEPENDENT_CODE ON)
+
+        string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+        if(is_cpu_variant)
+            list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
            set_target_properties(ggml::${_ggml_backend}
-                PROPERTIES
-                    INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
-                    IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-                    IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
-                    INTERFACE_COMPILE_FEATURES c_std_90
-                    POSITION_INDEPENDENT_CODE ON)
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")

-            string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
-            if(is_cpu_variant)
-                list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
-                set_target_properties(ggml::${_ggml_backend}
-                PROPERTIES
-                    INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
-
-                if(GGML_CPU_INTERFACE_LINK_OPTIONS)
-                    set_target_properties(ggml::${_ggml_backend}
-                        PROPERTIES
-                            INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
-                endif()
-
-            else()
-                list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
+            if(GGML_CPU_INTERFACE_LINK_OPTIONS)
                set_target_properties(ggml::${_ggml_backend}
                    PROPERTIES
-                        INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
-
-                if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
-                    set_target_properties(ggml::${_ggml_backend}
-                        PROPERTIES
-                            INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
-                endif()
+                        INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
            endif()

-            list(APPEND _ggml_all_targets ggml::${_ggml_backend})
-        endforeach()
-    endif()
+        else()
+            list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+
+            if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+                set_target_properties(ggml::${_ggml_backend}
+                    PROPERTIES
+                        INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+            endif()
+        endif()
+
+        list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+    endforeach()

    list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
    set_target_properties(ggml::ggml
@@ -74,26 +74,16 @@ extern "C" {
        GGML_OPT_BUILD_TYPE_OPT     = 30,
    };

-    enum ggml_opt_optimizer_type {
-        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
-        GGML_OPT_OPTIMIZER_TYPE_SGD,
-
-        GGML_OPT_OPTIMIZER_TYPE_COUNT
-    };
-
    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
    struct ggml_opt_optimizer_params {
+        // AdamW optimizer parameters
        struct {
            float alpha; // learning rate
-            float beta1; // first AdamW momentum
-            float beta2; // second AdamW momentum
+            float beta1;
+            float beta2;
            float eps;   // epsilon for numerical stability
-            float wd;    // weight decay - 0.0f to disable
+            float wd;    // weight decay for AdamW, use 0.0f to disable
        } adamw;
-        struct {
-            float alpha; // learning rate
-            float wd;    // weight decay
-        } sgd;
    };

    // callback to calculate optimizer parameters prior to a backward pass
@@ -122,11 +112,8 @@ extern "C" {

        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done

-        ggml_opt_get_optimizer_params get_opt_pars;    // callback for calculating optimizer parameters
-        void *                        get_opt_pars_ud; // userdata for calculating optimizer parameters
-
-        // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
-        enum ggml_opt_optimizer_type optimizer;
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
    };

    // get parameters for an optimization context with defaults set where possible
@@ -155,10 +142,6 @@ extern "C" {
    // get the gradient accumulator for a node from the forward graph
    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

-    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
-
-    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
-
    // ====== Optimization Result ======

    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
@@ -243,14 +226,12 @@ extern "C" {
            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
            enum ggml_opt_loss_type         loss_type,      // loss to minimize
-            enum ggml_opt_optimizer_type    optimizer,      // sgd or adamw
            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
            int64_t                         nepoch,         // how many times the dataset should be iterated over
            int64_t                         nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
            bool                            silent);        // whether or not info prints to stderr should be suppressed

-
 #ifdef  __cplusplus
 }
 #endif
@@ -1,16 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
@@ -241,8 +241,6 @@
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24

-#define GGML_MROPE_SECTIONS   4
-
 #define GGML_UNUSED(x) (void)(x)

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -306,16 +304,6 @@
    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

-#define GGML_TENSOR_TERNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb2, src2, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
@@ -407,8 +395,7 @@ extern "C" {
        // GGML_TYPE_IQ4_NL_4_4 = 36,
        // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
-        GGML_TYPE_COUNT   = 40,
+        GGML_TYPE_COUNT   = 39,
    };

    // precision
@@ -443,7 +430,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
-        GGML_FTYPE_MOSTLY_MXFP4   = 25, // except 1d tensors
    };

    // available tensor operations:
@@ -452,7 +438,6 @@ extern "C" {

        GGML_OP_DUP,
        GGML_OP_ADD,
-        GGML_OP_ADD_ID,
        GGML_OP_ADD1,
        GGML_OP_ACC,
        GGML_OP_SUB,
@@ -542,7 +527,6 @@ extern "C" {
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,
-        GGML_OP_OPT_STEP_SGD,

        GGML_OP_GLU,

@@ -573,7 +557,6 @@ extern "C" {
        GGML_GLU_OP_REGLU,
        GGML_GLU_OP_GEGLU,
        GGML_GLU_OP_SWIGLU,
-        GGML_GLU_OP_SWIGLU_OAI,
        GGML_GLU_OP_GEGLU_ERF,
        GGML_GLU_OP_GEGLU_QUICK,

@@ -848,13 +831,6 @@ extern "C" {
            struct ggml_tensor  * b,
            enum   ggml_type      type);

-    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
-    GGML_API struct ggml_tensor * ggml_add_id(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * ids);
-
    GGML_API struct ggml_tensor * ggml_add1(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1222,13 +1198,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    GGML_API struct ggml_tensor * ggml_swiglu_oai(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            float                 alpha,
-            float                 limit);
-
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@@ -1601,10 +1570,6 @@ extern "C" {
            float                 scale,
            float                 max_bias);

-    GGML_API void ggml_soft_max_add_sinks(
-            struct ggml_tensor * a,
-            struct ggml_tensor * sinks);
-
    GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1663,7 +1628,7 @@ extern "C" {
            struct ggml_tensor  * b,
            struct ggml_tensor  * c,
            int                   n_dims,
-            int                   sections[GGML_MROPE_SECTIONS],
+            int                   sections[4],
            int                   mode,
            int                   n_ctx_orig,
            float                 freq_base,
@@ -1689,22 +1654,6 @@ extern "C" {
            float                 beta_fast,
            float                 beta_slow);

-    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c,
-            int                   n_dims,
-            int                   sections[GGML_MROPE_SECTIONS],
-            int                   mode,
-            int                   n_ctx_orig,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -2103,10 +2052,6 @@ extern "C" {
    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
            const struct ggml_tensor * a);

-    GGML_API void ggml_flash_attn_ext_add_sinks(
-            struct ggml_tensor * a,
-            struct ggml_tensor * sinks);
-
    // TODO: needs to be adapted to ggml_flash_attn_ext
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
           struct ggml_context * ctx,
@@ -2312,14 +2257,7 @@ extern "C" {
            struct ggml_tensor  * grad,
            struct ggml_tensor  * m,
            struct ggml_tensor  * v,
-            struct ggml_tensor  * adamw_params); // parameters such as the learning rate
-
-    // stochastic gradient descent step (with weight decay)
-    GGML_API struct ggml_tensor * ggml_opt_step_sgd(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  grad,
-        struct ggml_tensor *  sgd_params); // alpha, weight decay
+            struct ggml_tensor  * adamw_params); // parameters such as the learning rate

    //
    // automatic differentiation
@@ -382,7 +382,6 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
-ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)

 foreach (target ggml-base ggml)
@@ -29,7 +29,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
-        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
@@ -49,10 +49,6 @@
 #include "ggml-webgpu.h"
 #endif

-#ifdef GGML_USE_ZDNN
-#include "ggml-zdnn.h"
-#endif
-
 #ifdef GGML_USE_OPENCL
 #include "ggml-opencl.h"
 #endif
@@ -184,9 +180,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_WEBGPU
        register_backend(ggml_backend_webgpu_reg());
 #endif
-#ifdef GGML_USE_ZDNN
-        register_backend(ggml_backend_zdnn_reg());
-#endif
 #ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
 #endif
@@ -1071,11 +1071,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                }
            }
        }
-        // if the node is still unassigned, assign it to the first backend that supports it
-        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
-            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
-        }
-        GGML_ASSERT(*cur_backend_id != -1);
    }

    // pass 5: split graph, find tensors that need to be copied
@@ -1103,7 +1098,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

            const int node_backend_id = tensor_backend_id(node);

-            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
+            assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
@@ -1161,7 +1156,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

                size_t src_id = hash_id(src);
                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-                GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
+                assert(src_backend_id != -1); // all inputs should be assigned by now

                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
@@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) {
    ggml_backend_blas_context * ctx = new ggml_backend_blas_context;

    ggml_backend_t backend = new ggml_backend {
-        /* .guid    = */ ggml_backend_blas_guid(),
-        /* .iface   = */ blas_backend_i,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
-        /* .context = */ ctx,
+        /* .guid      = */ ggml_backend_blas_guid(),
+        /* .interface = */ blas_backend_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
+        /* .context   = */ ctx,
    };

 #if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
@@ -31,13 +31,6 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
 string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
 message(STATUS "CANN: SOC_VERSION =  ${SOC_VERSION}")
-option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
-
-if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
-    message(FATAL_ERROR
-        "CANN Graph (ACL graph mode) is not supported on 310P devices. "
-        "Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
-endif()

 if (CANN_INSTALL_DIR)
    # Only Support Linux.
@@ -75,13 +68,6 @@ if (CANN_INSTALL_DIR)

    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

-    if (USE_ACL_GRAPH)
-        target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
-        message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
-    else()
-        message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
-    endif()
-
    message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
    message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
 else()
@@ -753,55 +753,69 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];

+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    if (ggml_are_same_shape(src0, dst)) {
-        aclTensor* acl_src = ggml_cann_create_tensor(src0);
-        aclTensor* acl_dst = ggml_cann_create_tensor(dst);
        if (dst->type == src0->type) {
            cann_copy(ctx, acl_src, acl_dst);
        } else {
            aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
        }
-        ggml_cann_release_resources(ctx, acl_src, acl_dst);
    } else {
-        void* src_trans_buffer = src0->data;
-        ggml_cann_pool_alloc src_buffer_allocator;
-        if (!ggml_is_contiguous(src0)) {
-            aclTensor* acl_src = ggml_cann_create_tensor(src0);
-            src_buffer_allocator.alloc(ctx.pool(),
-                ggml_nelements(src0) * ggml_type_size(src0->type));
-            src_trans_buffer = src_buffer_allocator.get();
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+            if (dst->type == src0->type) {
+                ggml_cann_async_memcpy(ctx, dst->data, src0->data, ggml_nbytes(dst),
+                    ACL_MEMCPY_DEVICE_TO_DEVICE);
+                ggml_cann_release_resources(ctx, acl_src, acl_dst); // early return: release handles created above
+                return;
+            } else {
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(),
+                    ggml_nelements(dst) * ggml_type_size(dst->type));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = ggml_type_size(dst->type);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                    GGML_MAX_DIMS);
+
+                aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+                size_t cpy_size = ggml_nbytes(dst);
+                ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE);
+                ggml_cann_release_resources(ctx, acl_src, acl_dst, src_trans_tensor); // early return: release all handles
+                return;
+            }
+        } else if (ggml_is_contiguous(dst)) {
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+            void* src_trans_buffer = src_buffer_allocator.get();
            size_t src_trans_nb[GGML_MAX_DIMS];
-            src_trans_nb[0] = ggml_type_size(src0->type);
+            src_trans_nb[0] = ggml_type_size(dst->type);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
-                src_trans_buffer, ggml_cann_type_mapping(src0->type),
-                ggml_type_size(src0->type), src0->ne, src_trans_nb,
+                src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                ggml_type_size(dst->type), src0->ne, src_trans_nb,
                GGML_MAX_DIMS);
-            cann_copy(ctx, acl_src, src_trans_tensor);
-            ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
-        }

-        size_t src_reshape_nb[GGML_MAX_DIMS];
-        src_reshape_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
-        }
+            aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));

-        aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
-            ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type),
-            dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
-        aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-        if (dst->type == src0->type) {
-            cann_copy(ctx, trans_acl_src, acl_dst);
+            size_t cpy_size = ggml_nbytes(dst);
+            ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
+                ACL_MEMCPY_DEVICE_TO_DEVICE);
+            ggml_cann_release_resources(ctx, acl_src, acl_dst, src_trans_tensor); // early return: release all handles
+            return;
        } else {
-            aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
+            GGML_ABORT("Unsupport dst is not tontiguous.");
        }
-        ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
    }
-    return;
+    ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }

 /**
@@ -1316,196 +1330,160 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
 }

 /**
- * @brief Generate a range of values and apply a scalar base exponentiation.
+ * @brief   Applies the Alibi (Attention with Linear Biases) mechanism to the source tensor.
+ * @details This function implements the Alibi mechanism, which adds fixed,
+ *          per-head linear biases to the attention scores to simulate relative
+ *          position encoding without the need for explicit positional
+ *          embeddings.
 *
- * This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
- * with step size `step`, stores it in a temporary buffer, and then computes:
+ * @param ctx          The backend CANN context for executing operations.
+ * @param acl_src      The source tensor representing the query or key.
+ * @param acl_position The position tensor containing relative positions.
+ * @param acl_dst      The destination tensor where the result will be stored.
+ * @param n_head       The number of attention heads.
+ * @param src_ne       The dimensions of the source tensor.
+ * @param src_nb0      The byte size of the first dimension of the source
+ *                     tensor.
+ * @param max_bias     The maximum bias value used in the Alibi mechanism.
+ * @param dst          The destination tensor object for additional metadata.
 *
- * @f[
- * slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
- * @f]
- *
- * The results are written to the provided @p slope_buffer.
- *
- * @param ctx           CANN backend context for memory allocation and operator execution.
- * @param slope_buffer  Pointer to the output buffer (float array) for the computed slope values.
- * @param m             Scalar base for the exponentiation.
- * @param size          Number of elements in the generated sequence.
- * @param start         Starting exponent offset.
- * @param stop          Stopping exponent offset (exclusive).
- * @param step          Step size for the exponent increment.
+ * The function performs the following steps:
+ * 1. Calculates the largest power of two not exceeding the number of heads
+ *    to determine the base for bias calculation.
+ * 2. Initializes arrays with arithmetic sequences and fills them with bias
+ *    values.
+ * 3. Computes the bias tensor based on the calculated biases and arithmetic
+ *    sequences.
+ * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
+ * 5. Multiplies the position tensor by the bias tensor.
+ * 6. Adds the result of the multiplication to the source tensor to produce the
+ *    final output.
 */
-static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
-    float m, int64_t size, float start, float stop, float step){
-    int64_t ne[] = {size};
-    size_t nb[] = {sizeof(float)};
+static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                        aclTensor* acl_position, aclTensor* acl_dst,
+                        const int n_head, int64_t* src_ne, const size_t src_nb0,
+                        float max_bias, ggml_tensor* dst) {
+    const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
+    GGML_ASSERT(src_nb0 == sizeof(float));
+    GGML_ASSERT(n_head == src_ne[2]);

-    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
-    void* arange_buffer = arange_allocator.get();
+    const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));

-    aclTensor* arange_tensor = ggml_cann_create_tensor(
-        arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
-    aclnn_arange(ctx, arange_tensor, start, stop, step, size);
+    float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

-    aclTensor* slope_tensor = ggml_cann_create_tensor(
-        slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
+    // init arange
+    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
+                                          ne2_ne3 * ggml_type_size(dst->type));
+    void* tmp_arange_buffer = arange_allocator.get();

-    aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
+    // arange1: [1, ..., n_heads_log2_floor+1)
+    float start = 1;
+    float stop = n_heads_log2_floor + 1;
+    float step = 1;
+    int64_t n_elements_arange = n_heads_log2_floor;

-    GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
-    ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
+    int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
+    size_t tmp_arange1_nb[] = {ggml_type_size(dst->type)};  // element stride in bytes, not sizeof(enum)
+    aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
+        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+    aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
+
+    aclTensor* tmp_arange2_tensor = nullptr;
+    if (n_heads_log2_floor < ne2_ne3) {
+        // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
+        start = 1;
+        stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
+        step = 2;
+        n_elements_arange = ne2_ne3 - n_heads_log2_floor;
+        int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+        size_t tmp_arange2_nb[] = {ggml_type_size(dst->type)};
+
+        tmp_arange2_tensor = ggml_cann_create_tensor(  // assign the outer handle so it is released below
+            (char*)tmp_arange_buffer +
+                n_heads_log2_floor * ggml_type_size(dst->type),
+            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+            tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+        aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
+                     n_elements_arange);
+    }
+
+    // init mk_base
+    ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
+                                           ne2_ne3 * ggml_type_size(dst->type));
+    void* tmp_mk_base_buffer = mk_base_allocator.get();
+    int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
+    size_t tmp_mk_base1_nb[] = {ggml_type_size(dst->type)};
+    aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
+        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+    aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
+
+    aclTensor* tmp_mk_base2_tensor = nullptr;
+    if (n_heads_log2_floor < ne2_ne3) {
+        int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+        size_t tmp_mk_base2_nb[] = {ggml_type_size(dst->type)};
+        tmp_mk_base2_tensor = ggml_cann_create_tensor(  // assign the outer handle so it is released below
+            (char*)tmp_mk_base_buffer +
+                n_heads_log2_floor * ggml_type_size(dst->type),
+            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+            tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+        aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
+    }
+
+    // init mk
+    int64_t tmp_mk_base_ne[] = {ne2_ne3};
+    size_t tmp_mk_base_nb[] = {ggml_type_size(dst->type)};
+    aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
+        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+    aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
+        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+    aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
+
+    // reshape mk
+    int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
+    size_t tmp_mk_nb[GGML_MAX_DIMS];
+    tmp_mk_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
+    }
+    aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
+        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+
+    // acl_position * mk
+    int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
+    size_t tmp_output_nb[GGML_MAX_DIMS];
+    tmp_output_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
+    }
+    ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
+    void* tmp_output_buffer = output_allocator.get();
+    aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
+        tmp_output_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+    aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
+
+    // add
+    aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
+    ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
+        tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
+        tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
 }

-/**
- * @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
- *
- * This function generates slope values for each attention head according to the ALiBi
- * (Attention with Linear Biases) method. It splits the computation into two ranges depending
- * on whether the head index is less than @p n_head_log2 or not, and uses different base values
- * (`m0` and `m1`) for the exponentiation.
- *
- * @f[
- * slope[h] =
- * \begin{cases}
- * m_0^{(h + 1)}, & h < n\_head\_log2 \\
- * m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
- * \end{cases}
- * \quad , \quad \text{if } max\_bias > 0
- * @f]
- *
- * If @p max_bias <= 0, all slope values are set to 1.0.
- *
- * @param ctx           CANN backend context for memory allocation and operator execution.
- * @param n_head        Total number of attention heads.
- * @param slope_buffer  Pointer to the output buffer (float array) for storing slopes.
- * @param max_bias      Maximum bias value for slope computation.
- *
-*/
-static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
-    void* slope_buffer, float max_bias) {
-    const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    float m0 = powf(2.0f, -(max_bias) / n_head_log2);
-    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-    // const float slope = (max_bias > 0.0f) ?
-    //                          h < n_head_log2 ?
-    //                              powf(m0, h + 1) :
-    //                              powf(m1, 2*(h - n_head_log2) + 1) :
-    //                          1.0f;
-    // arange1
-    float start = 0 + 1;
-    float end   = (n_head_log2 - 1) + 1;
-    float step  = 1;
-    float count = n_head_log2;
-    // end needs to be +1 because aclnn uses a left-closed, right-open interval.
-    aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
-    if (n_head_log2 < n_head) {
-        // arange2
-        start = 2 * (n_head_log2 - n_head_log2) + 1;
-        end   = 2 * ((n_head - 1) - n_head_log2) + 1;
-        step  = 2;
-        count = n_head - n_head_log2;
-        aclnn_get_slope_inner(
-            ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
-            m1, count, start, end + 1, step);
-    }
-}
-
-/**
- * @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
- *
- * This function computes the ALiBi slopes for each attention head (if max_bias > 0),
- * multiplies them with the attention mask to produce bias tensors, and adds these biases
- * to the destination tensor (@p dst).
- *
- * The function performs necessary broadcasting of the mask and slope tensors to match
- * the shape of the destination tensor, then applies element-wise multiplication and addition
- * using CANN operators.
- *
- * @param ctx         CANN backend context for memory management and operator execution.
- * @param mask        Input attention mask tensor, assumed to be contiguous.
- * @param dst         Destination tensor to which ALiBi biases will be added.
- * @param dst_ptr     Pointer to the memory of the destination tensor.
- * @param max_bias    Maximum bias value controlling the slope scaling.
- *
- * @note
- * - Write data into dst_ptr using only the shape information of the dst tensor.
- * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
- */
-static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
-    ggml_tensor* dst, void* dst_ptr, float max_bias) {
-    void* slope_buffer = nullptr;
-    void* bias_buffer = nullptr;
-
-    if (max_bias > 0.0f) {
-        int64_t n_heads = dst->ne[2];
-        ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
-        slope_buffer = slope_allocator.get();
-        ggml_cann_pool_alloc bias_allocator(
-                    ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
-        bias_buffer = bias_allocator.get();
-        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
-    }
-
-    // broadcast for mask, slop and dst;
-    int64_t nr2 = dst->ne[2] / mask->ne[2];
-    int64_t nr3 = dst->ne[3] / mask->ne[3];
-
-    // broadcast the mask across rows
-    int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
-    size_t  mask_nb[] = {
-        mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
-        mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
-    };
-
-    int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
-    size_t  dst_nb[] = {
-        dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
-        dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
-    };
-
-    // slope is a 1 dim tensor, slope.ne2 == dst.ne2
-    int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
-    size_t  slope_nb[GGML_MAX_DIMS + 2];
-    slope_nb[0] = sizeof(float);
-    for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
-        slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
-    }
-
-    aclTensor* acl_slope = ggml_cann_create_tensor(
-                            slope_buffer, ACL_FLOAT, sizeof(float),
-                            slope_ne, slope_nb, GGML_MAX_DIMS + 2);
-    aclTensor* acl_mask = ggml_cann_create_tensor(
-                            mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
-
-    // write data into dst_ptr using only the shape information of the dst tensor.
-    aclTensor* acl_dst  = ggml_cann_create_tensor(
-                            dst_ptr, ggml_cann_type_mapping(dst->type),
-                            ggml_type_size(dst->type), dst_ne, dst_nb,
-                            GGML_MAX_DIMS + 2);
-
-    if (max_bias > 0.0f) {
-        int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
-        size_t  bias_nb[GGML_MAX_DIMS + 2];
-        bias_nb[0] = sizeof(float);
-        for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
-            bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
-        }
-        aclTensor* bias_tensor = ggml_cann_create_tensor(
-                                    bias_buffer, ACL_FLOAT, sizeof(float),
-                                    bias_ne, bias_nb, GGML_MAX_DIMS + 2);
-
-        aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
-        aclnn_add(ctx, acl_dst, bias_tensor);
-        ggml_cann_release_resources(ctx, bias_tensor);
-    } else {
-        aclnn_add(ctx, acl_dst, acl_mask);
-    }
-    ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
-}
-
-void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_cann_dup(ctx, dst);
 }

@@ -1523,41 +1501,118 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 * @param acl_dst The destination tensor where the softmax results will be
 * stored.
 */
-static void aclnn_softmax(ggml_backend_cann_context & ctx,
-    aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
+static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                          int64_t dim, aclTensor* acl_dst) {
    GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
 }

-void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];  // mask

    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-    aclTensor* acl_dst  = ggml_cann_create_tensor(dst);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    float scale    = 1.0f;
+    float scale = 1.0f;
    float max_bias = 0.0f;

-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));

    // input mul scale
    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
-    ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
-    void* src_tensor_buffer = src_tensor_allocator.get();
-    aclTensor* softmax_tensor = ggml_cann_create_tensor(
-        src_tensor_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);

-    aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
+    size_t n_bytes = ggml_nbytes(src0);
+    ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
+    void* input_mul_scale_buffer = mul_scale_allocator.get();
+    aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
+        input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
+        src0->nb, GGML_MAX_DIMS);
+
+    // out-of-place: write scale * src0 into the temporary pool buffer
+    aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, false);

    // mask
+    aclTensor* acl_src1_fp32_tensor = nullptr;
+    aclTensor* tmp_mask_tensor = nullptr;
+    ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
    if (src1) {
-        aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
+        const bool use_f16 = src1->type == GGML_TYPE_F16;
+        if (use_f16) {
+            // cast the F16 mask to fp32 into a contiguous pool buffer
+            size_t src1_fp32_bytes = ggml_nelements(src1) * sizeof(float_t);
+            size_t src1_fp32_nb[GGML_MAX_DIMS];
+            src1_fp32_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
+            }
+            src1_fp32_allocator.alloc(src1_fp32_bytes);
+            void* src1_fp32_buffer = src1_fp32_allocator.get();
+            acl_src1_fp32_tensor = ggml_cann_create_tensor(
+                src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
+                src1_fp32_nb, GGML_MAX_DIMS);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+            aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
+            ggml_cann_release_resources(ctx, acl_src1);
+        } else {
+            acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
+        }
+
+        // broadcast the mask across rows, only use ne11 of ne01 in mask
+        if (src1->ne[1] != src0->ne[1]) {
+            // fp32 view of shape [ne00, ne01, 1, 1]; F16 masks are read from the cast buffer
+            int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
+            size_t tmp_mask_nb[GGML_MAX_DIMS];
+            tmp_mask_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
+            }
+            tmp_mask_tensor = ggml_cann_create_tensor(
+                use_f16 ? src1_fp32_allocator.get() : src1->data, ACL_FLOAT,
+                sizeof(float), tmp_mask_ne, tmp_mask_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+        }
+
+        // alibi: add the mask (scaled per head when max_bias > 0) to the scaled input
+        const int n_head = src0->ne[2];
+        const size_t src_nb0 = src0->nb[0];
+
+        n_bytes = ggml_nbytes(dst);
+        ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
+        void* output_buffer = output_allocator.get();
+        aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
+            output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
+            dst->nb, GGML_MAX_DIMS);
+        if (max_bias <= 0.0f) {
+            // slope = 1.0
+            if (tmp_mask_tensor) {
+                aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
+                          alibi_output_tensor);
+            } else {
+                aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
+                          alibi_output_tensor);
+            }
+        } else {
+            // slope != 1.0
+            if (tmp_mask_tensor) {
+                aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
+                            alibi_output_tensor, n_head, src0->ne, src_nb0,
+                            max_bias, dst);
+            } else {
+                aclnn_alibi(ctx, acl_input_mul_scale_tensor,
+                            acl_src1_fp32_tensor, alibi_output_tensor, n_head,
+                            src0->ne, src_nb0, max_bias, dst);
+            }
+        }
+
+        // softmax
+        aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
+        ggml_cann_release_resources(ctx, alibi_output_tensor);
+    } else {
+        aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
    }
-    // softmax
-    aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
-    ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
+
+    ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
+        acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
 }

 /**
@@ -3153,24 +3208,104 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
            // Compute the slope if needed. Derived from ggml_cann_softmax().
            if(maxBias != 0.0f){
                // alibi
-                const int64_t n_heads = src0->ne[2];
-                ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
-                void* slope_buffer = slope_allocator.get();
-                aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
+                const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
+                const int64_t n_head = src0->ne[2];
+                const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
+                float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
+                float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
+                // init arange
+                ggml_cann_pool_alloc arange_allocator(ctx.pool(),
+                                                    ne2_ne3 * faElemSize);
+                void* tmp_arange_buffer = arange_allocator.get();

-                int64_t slope_ne[] = {1, 1, n_heads, 1};
-                size_t slope_nb[GGML_MAX_DIMS];
-                slope_nb[0] = sizeof(float);
-                for(int i = 1;i<GGML_MAX_DIMS;i++) {
-                    slope_nb[i] = slope_nb[i-1] * slope_ne[0];
+                // arange1: [1, ..., n_heads_log2_floor+1)
+                float start = 1;
+                float stop = n_heads_log2_floor + 1;
+                float step = 1;
+                int64_t n_elements_arange = n_heads_log2_floor;
+
+                int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
+                size_t tmp_arange1_nb[] = {faElemSize};
+                aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
+                    tmp_arange_buffer, faDataType, faElemSize,
+                    tmp_arange1_ne, tmp_arange1_nb,
+                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+                aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
+
+                aclTensor* tmp_arange2_tensor = nullptr;
+                if (n_heads_log2_floor < ne2_ne3) {
+                    // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
+                    start = 1;
+                    stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
+                    step = 2;
+                    n_elements_arange = ne2_ne3 - n_heads_log2_floor;
+                    int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+                    size_t tmp_arange2_nb[] = {faElemSize};
+                    // assign the outer handle (no redeclaration) so it is released below
+                    tmp_arange2_tensor = ggml_cann_create_tensor(
+                        (char*)tmp_arange_buffer +
+                            n_heads_log2_floor * faElemSize,
+                        faDataType, faElemSize,
+                        tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+                    aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
+                                n_elements_arange);
                }

-                aclTensor* slope_tensor = ggml_cann_create_tensor(
-                    slope_buffer, ACL_FLOAT, sizeof(float),
-                    slope_ne, slope_nb, GGML_MAX_DIMS);
-                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
+                // init mk_base
+                ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
+                                                    ne2_ne3 * faElemSize);
+                void* tmp_mk_base_buffer = mk_base_allocator.get();
+                int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
+                size_t tmp_mk_base1_nb[] = {faElemSize};
+                aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
+                    tmp_mk_base_buffer, faDataType, faElemSize,
+                    tmp_mk_base1_ne, tmp_mk_base1_nb,
+                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

-                ggml_cann_release_resources(ctx, slope_tensor);
+                aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
+
+                aclTensor* tmp_mk_base2_tensor = nullptr;  // assigned below, released at the end
+                if (n_heads_log2_floor < ne2_ne3) {
+                    int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+                    size_t tmp_mk_base2_nb[] = {faElemSize};
+                    tmp_mk_base2_tensor = ggml_cann_create_tensor(
+                        (char*)tmp_mk_base_buffer +
+                            n_heads_log2_floor * faElemSize,
+                        faDataType, faElemSize,
+                        tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+                    aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
+                }
+
+                // init mk
+                int64_t tmp_mk_base_ne[] = {ne2_ne3};
+                size_t tmp_mk_base_nb[] = {faElemSize};
+                aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
+                    tmp_mk_base_buffer, faDataType, faElemSize,
+                    tmp_mk_base_ne, tmp_mk_base_nb,
+                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+                aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
+                    tmp_arange_buffer, faDataType, faElemSize,
+                    tmp_mk_base_ne, tmp_mk_base_nb,
+                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+                aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
+
+                // reshape mk
+                int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
+                size_t tmp_mk_nb[GGML_MAX_DIMS];
+                tmp_mk_nb[0] = faElemSize;
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
+                }
+                aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
+                    tmp_mk_base_buffer, faDataType, faElemSize,
+                    tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
+                    ACL_FORMAT_ND);
+                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
+
+                ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
+                    tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
+                    tmp_arange_tensor, tmp_mk_tensor);
            }
        }

@@ -337,29 +337,6 @@ private:
    int32_t device_;
 };

-#ifdef USE_ACL_GRAPH
-struct ggml_graph_node_properties {
-    void * node_address;
-    ggml_op node_op;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
-    void * src_address[GGML_MAX_SRC];
-    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-};
-
-struct ggml_cann_graph {
-    ~ggml_cann_graph() {
-        if (graph != nullptr) {
-            aclmdlRIDestroy(graph);
-        }
-    }
-
-    aclmdlRI graph = nullptr;
-
-    std::vector<ggml_graph_node_properties> ggml_graph_properties;
-};
-#endif  // USE_ACL_GRAPH
-
 /**
 * @brief Context for managing CANN backend operations.
 */
@@ -368,13 +345,8 @@ struct ggml_backend_cann_context {
    std::string name;                /**< Name of the device. */
    std::string description;         /**< Description of the device. */
    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
-#ifdef USE_ACL_GRAPH
-    /// Cached CANN ACL graph used for executing the current ggml computation graph.
-    std::unique_ptr<ggml_cann_graph> cann_graph;
-#endif
    cann_task_queue task_queue;
    bool async_mode;
-    bool support_set_rows;

    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */

@@ -390,14 +362,6 @@ struct ggml_backend_cann_context {
        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
            device, async_mode ? "ON" : "OFF");
-
-        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
-        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
-
-        if (!support_set_rows) {
-            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
-                    "Falling back to eager mode.\n", __func__);
-        }
    }

    /**
@@ -2075,160 +2075,6 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
    ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
 }

-#ifdef USE_ACL_GRAPH
-/**
- * @brief Populate the internal CANN graph node properties from the ggml computation graph.
- *
- * This function copies all node attributes (operation type, dimensions, strides, input sources,
- * and operation parameters) into the cached CANN graph structure for later reuse or comparison.
- *
- * @param cann_ctx  The CANN backend context.
- * @param cgraph    The ggml computational graph.
- */
-static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
-    for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
-        ggml_tensor * node = cgraph->nodes[node_idx];
-        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
-        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
-
-        for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
-            cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
-            cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
-        }
-        for (int src = 0; src < GGML_MAX_SRC; src++) {
-            cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
-                node->src[src] ? node->src[src]->data : nullptr;
-        }
-        memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
-    }
-}
-
-/**
- * @brief Check if a ggml tensor node matches a previously captured CANN graph node.
- *
- * This function compares all relevant fields (address, op type, shape, source inputs, op params)
- * to determine whether the current node matches a previously recorded version.
- *
- * @param node                  The current ggml tensor node.
- * @param graph_node_properties The stored properties of a CANN graph node.
- * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
- */
-static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
-    if (node->data != graph_node_properties->node_address &&
-           node->op != GGML_OP_VIEW) {
-        return false;
-    }
-    if (node->op != graph_node_properties->node_op) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (node->ne[i] != graph_node_properties->ne[i]) {
-            return false;
-        }
-        if (node->nb[i] != graph_node_properties->nb[i]) {
-            return false;
-        }
-    }
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (node->src[i] &&
-            node->src[i]->data != graph_node_properties->src_address[i] &&
-            node->op != GGML_OP_VIEW
-        ) {
-            return false;
-        }
-    }
-    if (node->op == GGML_OP_SCALE &&
-        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
-        return false;
-    }
-    return true;
-}
-
-/**
- * @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
- *
- * This checks whether the number or properties of ggml graph nodes have changed
- * compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
- *
- * @param cann_ctx  The CANN backend context.
- * @param cgraph    The current ggml computation graph.
- * @return true if an update is required; false otherwise.
- */
-static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
-    // The number of nodes is different, so the graph needs to be reconstructed.
-    if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
-        cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
-        return true;
-    }
-
-    // The number of nodes is the same; iterate over each node to check whether they match.
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        bool has_matching_properties = ggml_graph_node_has_matching_properties(
-            cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
-        if(!has_matching_properties) {
-            return true;
-        }
-    }
-    return false;
-}
-#endif  // USE_ACL_GRAPH
-
-/**
- * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
- *
- * If CANN graph execution is enabled and graph capture is required, this function begins
- * graph capture, runs the graph, ends capture, and stores the captured graph.
- *
- * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
- *
- * @param cann_ctx                 The CANN backend context.
- * @param cgraph                   The ggml computation graph.
- * @param use_cann_graph           Whether to use CANN graph execution.
- * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
- */
-static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
-    bool & use_cann_graph, bool & cann_graph_update_required) {
-#ifdef USE_ACL_GRAPH
-    if (use_cann_graph && cann_graph_update_required) {
-        if (cann_ctx->cann_graph->graph != nullptr) {
-            ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
-            cann_ctx->cann_graph->graph = nullptr;
-        }
-        ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
-    }
-#endif // USE_ACL_GRAPH
-
-    // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
-    // With the use of CANN graphs, the execution will be performed by the graph launch.
-    if (!use_cann_graph || cann_graph_update_required) {
-        for (int i = 0; i < cgraph->n_nodes; i++) {
-            ggml_tensor * node = cgraph->nodes[i];
-
-            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-                continue;
-            }
-
-            bool ok = ggml_cann_compute_forward(*cann_ctx, node);
-            if (!ok) {
-                GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-            }
-            GGML_ASSERT(ok);
-        }
-    }
-
-#ifdef USE_ACL_GRAPH
-    if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
-        ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
-    }
-
-    if (use_cann_graph) {
-        // Execute graph
-        ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
-    }
-#endif // USE_ACL_GRAPH
-}
-
-
 /**
 * @brief Computes a computational graph using a CANN backend.
 *
@@ -2245,37 +2091,26 @@ static enum ggml_status ggml_backend_cann_graph_compute(
    ggml_backend_t backend, ggml_cgraph* cgraph) {
    ggml_backend_cann_context* cann_ctx =
        (ggml_backend_cann_context*)backend->context;
+
    ggml_cann_set_device(cann_ctx->device);
+    // release the temporary buffer created by set tensor.
    release_nz_workspace();
-#ifdef USE_ACL_GRAPH
-    bool use_cann_graph = true;
-    bool cann_graph_update_required = false;

-    // check environment LLAMA_SET_ROWS
-    if (!cann_ctx->support_set_rows) {
-        use_cann_graph = false;
-    }
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor* node = cgraph->nodes[i];

-    if (use_cann_graph) {
-        if (cann_ctx->cann_graph == nullptr) {
-            cann_ctx->cann_graph.reset(new ggml_cann_graph());
-            cann_graph_update_required = true;
+        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
+            continue;
        }

-        cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
-        set_ggml_graph_node_properties(cann_ctx, cgraph);
-    }
-#else
-    bool use_cann_graph = false;
-    bool cann_graph_update_required = false;
-#endif  // USE_ACL_GRAPH
+        bool ok = ggml_cann_compute_forward(*cann_ctx, node);

-    evaluate_and_capture_cann_graph(
-        cann_ctx,
-        cgraph,
-        use_cann_graph,
-        cann_graph_update_required
-    );
+        if (!ok) {
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
+                    node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+    }

    return GGML_STATUS_SUCCESS;
 }
@@ -2391,6 +2226,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // only support F32 and F16.
                return false;
            }
+
+            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
+                // a non-contiguous dst is not supported when its shape differs from src.
+                return false;
+            }
+
            return true;
        } break;
        case GGML_OP_CONT: {
@@ -2456,8 +2297,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            // value of paddingW should be at most half of kernelW
            return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
        }
-        case GGML_OP_DUP:
        case GGML_OP_SUM:
+        case GGML_OP_DUP:
        case GGML_OP_IM2COL:
        case GGML_OP_CONCAT:
        case GGML_OP_REPEAT:
@@ -2499,11 +2340,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
            return bias == 0.0f; // TODO: support bias != 0.0f
        case GGML_OP_SOFT_MAX:
-            // TODO: support attention sinks [TAG_ATTN_SINKS]
-            if (op->src[2]) {
-                return false;
-            }
-            return true;
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
        case GGML_OP_FLASH_ATTN_EXT:{
            // derived from [ggml-cuda.cu]
            if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2515,10 +2354,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
                return false;
            }
-            // TODO: support attention sinks [TAG_ATTN_SINKS]
-            if (op->src[4]) {
-                return false;
-            }
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                // different head sizes of K and V are not supported yet
                return false;
@@ -2530,6 +2365,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // DeepSeek MLA
                return false;
            }
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
            float logitSoftcap = 0.0f;
            memcpy(&logitSoftcap,  (float*)op->op_params + 2, sizeof(float));
            if(logitSoftcap != 0.0f) {
@@ -99,9 +99,6 @@ typedef sycl::half2 ggml_half2;
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 #define QR4_1 2

-#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
-#define QR_MXFP4 2
-
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2

@@ -187,13 +184,6 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");

-#define QK_MXFP4 32
-typedef struct {
-    uint8_t e; // E8M0
-    uint8_t qs[QK_MXFP4/2];
-} block_mxfp4;
-static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
-
 #define QK5_0 32
 typedef struct {
    ggml_half d;           // delta
@@ -1084,17 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()

-// TODO: fix name to kvalues_iq4_nl
 GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 GGML_TABLE_END()

-// e2m1 values (doubled)
-// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
-    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
-GGML_TABLE_END()
-
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
@@ -460,7 +460,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
            #       binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
            message(STATUS "z17 target")
-            list(APPEND ARCH_FLAGS -march=arch15)
+            list(APPEND ARCH_FLAGS -march=z17)
        else()
            message(STATUS "Unknown target")
            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@@ -13,7 +13,6 @@
 #define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
 #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
 #define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -40,22 +39,18 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
@@ -73,7 +68,6 @@
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -84,21 +78,18 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__loongarch64)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -109,14 +100,12 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__riscv)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -131,7 +120,6 @@
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
 #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -141,13 +129,11 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -163,7 +149,6 @@
 #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
 #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -174,14 +159,12 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__wasm__)
 // quants.c
 #define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -196,7 +179,6 @@
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
 #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -207,12 +189,10 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
-#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
-#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #endif
@@ -589,67 +589,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;
 }

-void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK_MXFP4 == 0);
-    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
-
-    const block_mxfp4 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_MXFP4;
-
-    int ib = 0;
-    float sumf = 0;
-
-#if defined __ARM_NEON
-    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
-    const uint8x16_t m4b = vdupq_n_u8(0x0f);
-    uint8x16x2_t q4bits;
-    int8x16x4_t q4b;
-    int8x16x4_t q8b;
-    int32x4_t prod_1;
-    int32x4_t prod_2;
-
-    for (; ib + 1 < nb; ib += 2) {
-        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
-        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
-        q8b.val[0]    = vld1q_s8(y[ib + 0].qs);
-        q8b.val[1]    = vld1q_s8(y[ib + 0].qs + 16);
-        q8b.val[2]    = vld1q_s8(y[ib + 1].qs);
-        q8b.val[3]    = vld1q_s8(y[ib + 1].qs + 16);
-
-        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
-        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
-        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
-
-        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
-        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
-
-        sumf +=
-            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
-            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
-    }
-
-#endif
-    for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
-        int sumi1 = 0;
-        int sumi2 = 0;
-        for (int j = 0; j < QK_MXFP4/2; ++j) {
-            sumi1 += y[ib].qs[j +          0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >>  4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
-    *s = sumf;
-}
-
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -66,12 +66,6 @@ static inline int hsum_i32_4(const __m128i a) {
 }

 #if defined(__AVX2__) || defined(__AVX512F__)
-static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return _mm256_maddubs_epi16(ax, sy);
-}
-
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
    uint32_t x32;
@@ -267,11 +261,6 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
    return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
                           _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
 }
-
-static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
-    return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
-                           _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
-}
 #endif
 #elif defined(__SSSE3__)
 // horizontally add 4x4 floats
@@ -757,91 +746,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }

-void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK_MXFP4 == 0);
-    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
-
-    const block_mxfp4 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_MXFP4;
-
-    int ib = 0;
-    float sumf = 0;
-
-#if defined __AVX2__
-
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m256i mone = _mm256_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (; ib + 1 < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
-        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
-        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
-        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
-        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
-        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
-        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
-                _mm256_cvtepi32_ps(p_1), accum1);
-        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
-                _mm256_cvtepi32_ps(p_2), accum2);
-    }
-
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
-
-#elif defined __AVX__
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-
-    __m256 accum = _mm256_setzero_ps();
-    for (; ib + 1 < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
-        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
-        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
-        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
-        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
-
-        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
-        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
-        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
-        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-
-        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
-        const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d);
-        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
-    }
-
-    sumf = hsum_float_8(accum);
-
-#endif
-    for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
-        int sumi1 = 0;
-        int sumi2 = 0;
-        for (int j = 0; j < QK_MXFP4/2; ++j) {
-            sumi1 += y[ib].qs[j +          0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >>  4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
-    *s = sumf;
-}
-
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -3302,6 +3206,14 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }

+#if defined(__AVX2__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+#endif
+
 void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
@@ -253,12 +253,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_1,
        .nrows                    = 1,
    },
-    [GGML_TYPE_MXFP4] = {
-        .from_float               = quantize_row_mxfp4,
-        .vec_dot                  = ggml_vec_dot_mxfp4_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-    },
    [GGML_TYPE_Q2_K] = {
        .from_float               = quantize_row_q2_K,
        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
@@ -1676,10 +1670,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_add(params, tensor);
            } break;
-        case GGML_OP_ADD_ID:
-            {
-                ggml_compute_forward_add_id(params, tensor);
-            } break;
        case GGML_OP_ADD1:
            {
                ggml_compute_forward_add1(params, tensor);
@@ -1934,7 +1924,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
-                ggml_compute_forward_flash_attn_ext(params, tensor);
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
@@ -2022,11 +2012,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                ggml_compute_forward_opt_step_adamw(params, tensor);
            }
            break;
-        case GGML_OP_OPT_STEP_SGD:
-            {
-                ggml_compute_forward_opt_step_sgd(params, tensor);
-            }
-            break;
        case GGML_OP_NONE:
            {
                // nop
@@ -2126,7 +2111,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_DUP:
        case GGML_OP_CONT:
        case GGML_OP_ADD:
-        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_ACC:
            {
@@ -2188,7 +2172,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
-                case GGML_GLU_OP_SWIGLU_OAI:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    {
@@ -2330,7 +2313,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_CROSS_ENTROPY_LOSS:
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
        case GGML_OP_OPT_STEP_ADAMW:
-        case GGML_OP_OPT_STEP_SGD:
            {
                n_tasks = n_threads;
            } break;
@@ -2691,7 +2673,6 @@ struct ggml_cplan ggml_graph_plan(
                        }
                    } break;
                case GGML_OP_ADD:
-                case GGML_OP_ADD_ID:
                case GGML_OP_ADD1:
                    {
                        if (ggml_is_quantized(node->src[0]->type)) {
@@ -35,7 +35,7 @@

 // ggml-backend interface

-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
+std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

@@ -57,6 +57,8 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
        }
 #endif

+        bufts.push_back(NULL);
+
        return bufts;
    }();

@@ -64,20 +66,14 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
 }

 static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
-    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
-        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
-        bufts.push_back(nullptr);
-        return bufts;
-    }();
-
-    return extra_bufts.data();
+    return ggml_backend_cpu_get_extra_buffers_type().data();

    GGML_UNUSED(device);
 }

 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
-        if (extra == buft) {
+    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) {
            return true;
        }
    }
@@ -214,10 +210,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
    ctx->abort_callback_data = NULL;

    ggml_backend_t cpu_backend = new ggml_backend {
-        /* .guid    = */ ggml_backend_cpu_guid(),
-        /* .iface   = */ ggml_backend_cpu_i,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ ctx,
+        /* .guid      = */ ggml_backend_cpu_guid(),
+        /* .interface = */ ggml_backend_cpu_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context   = */ ctx,
    };

    if (cpu_backend == NULL) {
@@ -401,13 +397,20 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
        return true;
    }

-    // check extra buffer types
-    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
-    for (int i = 0; i < 4; i++) {
-        if (op->src[i] && op->src[i]->buffer &&
-            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
-            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
-            return buf_extra->supports_op(dev, op);
+    // is the op supported by one of the extra buffer types?
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra) {
+            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            if (buf_extra && buf_extra->supports_op(dev, op)) {
+                return true;
+            }
+        }
+    }
+
+    // all other cases need host buffers.
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
+            return false;
        }
    }

@@ -259,10 +259,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
                const int64_t m_start      = 0;

                const int64_t n_step      = static_cast<int64_t>(kernel->get_n_step());
-                int64_t num_threads       = KAI_MIN(n / n_step, nth);
-                if (num_threads <= 0) {
-                    num_threads = 1;
-                }
+                const int64_t num_threads = KAI_MIN(n / n_step, nth);

                if (ith < num_threads) {
                    const int64_t num_n_per_thread0   = round_down(n / num_threads, n_step);
@@ -312,8 +309,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        GGML_ASSERT(kernel);

        const int ith = params->ith;
-        const int nth_raw = params->nth;
-        const int nth = nth_raw > 0 ? nth_raw : 1;
+        const int nth = params->nth;

        const size_t k = ne00;
        const size_t m = ne11;
@@ -331,12 +327,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
        const size_t n_start = ith * num_n_per_thread;

-        size_t n_to_process = 0;
-        if (n_start < n) {
-            n_to_process = num_n_per_thread;
-            if ((n_start + n_to_process) > n) {
-                n_to_process = n - n_start;
-            }
+        size_t n_to_process = num_n_per_thread;
+        if ((n_start + n_to_process) > n) {
+            n_to_process = n - n_start;
        }

        // Calculate number of columns to be processed per thread
@@ -368,10 +361,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const void* lhs_ptr            = (const void*)((const char *)lhs_packed + lhs_packed_offset);
        float *dst_ptr                 = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);

-        if (n_to_process > 0) {
-            variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
-                               sizeof(float), -FLT_MAX, FLT_MAX);
-        }
+        variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+                           sizeof(float), -FLT_MAX, FLT_MAX);

        return true;
    }
@@ -8,7 +8,6 @@
 #include "vec.h"

 #include <float.h>
-#include <algorithm>

 // ggml_compute_forward_dup

@@ -1284,7 +1283,6 @@ void ggml_compute_forward_add(
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -1311,77 +1309,6 @@ void ggml_compute_forward_add(
    }
 }

-// ggml_compute_forward_add_id
-
-static void ggml_compute_forward_add_id_f32(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const ggml_tensor * src2 = dst->src[2];
-
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(src2->type == GGML_TYPE_I32);
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT(src1->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_TERNARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        // src1 indices
-        const int i11 = *(int32_t *) ((char *) src2->data + i1*nb20 + i2*nb21);
-
-        GGML_ASSERT(i11 >= 0 && i11 < ne11);
-
-        ggml_vec_add_f32(ne0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                (float *) ((char *) src1->data + i11*nb11));
-    }
-}
-
-void ggml_compute_forward_add_id(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add_id_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("unsupported type for ggml_compute_forward_add_id: %s", ggml_type_name(src0->type));
-            }
-    }
-}
-
 // ggml_compute_forward_add1

 static void ggml_compute_forward_add1_f32(
@@ -1733,7 +1660,6 @@ void ggml_compute_forward_add1(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -1861,7 +1787,6 @@ void ggml_compute_forward_acc(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -3689,93 +3614,6 @@ static void ggml_compute_forward_swiglu(
    }
 }

-// ggml_compute_forward_swiglu_oai
-
-static void ggml_compute_forward_swiglu_oai_f32(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    char * src0_d = (char *) src0->data;
-    char * src1_d = (char *) (src1 ? src1->data : src0->data);
-    const size_t src0_o = src0->nb[1];
-    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
-
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-
-    if (src1) {
-        GGML_ASSERT(ggml_is_contiguous_1(src1));
-        GGML_ASSERT(src0->type == src1->type);
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
-    const int nr = ggml_nrows(src0);
-
-    GGML_ASSERT(dst->ne[0] == nc);
-    GGML_ASSERT(ggml_nrows(dst) == nr);
-
-    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
-    const float alpha = ggml_get_op_params_f32(dst, 2);
-    const float limit = ggml_get_op_params_f32(dst, 3);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * src0_p = (float *) (src0_d + i1*src0_o);
-        float * src1_p = (float *) (src1_d + i1*src1_o);
-        float * dst_p  = (float *) ((char *) dst->data + i1*(dst->nb[1]));
-
-        if (!src1) {
-            src0_p += swapped ? nc : 0;
-            src1_p += swapped ? 0 : nc;
-        }
-
-        for (int k = 0; k < nc; k++) {
-            const float x = std::min(src0_p[k], limit);
-            const float y = std::clamp(src1_p[k], -limit, limit);
-            const float out_glu = x / (1.f + expf(alpha * (-x)));
-            dst_p[k] = out_glu * (y + 1.f);
-        }
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = dst_p[k];
-            GGML_UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_swiglu_oai(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_swiglu_oai_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
 // ggml_compute_forward_geglu_erf

 static void ggml_compute_forward_geglu_erf_f32(
@@ -4761,7 +4599,6 @@ void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5036,7 +4873,6 @@ void ggml_compute_forward_set(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5298,7 +5134,6 @@ void ggml_compute_forward_get_rows(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5688,7 +5523,6 @@ static void ggml_compute_forward_soft_max_f32(

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
-    const ggml_tensor * src2 = dst->src[2];

    assert(ggml_is_contiguous(dst));
    assert(ggml_are_same_shape(src0, dst));
@@ -5723,9 +5557,6 @@ static void ggml_compute_forward_soft_max_f32(

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

-    // sinks
-    const float * sk = src2 ? (float *)((char *) src2->data) : nullptr;
-
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
@@ -5768,18 +5599,9 @@ static void ggml_compute_forward_soft_max_f32(
                float max = -INFINITY;
                ggml_vec_max_f32(ne00, &max, wp);

-                // if we have sinks, make a correction as if they were included in the softmax
-                if (sk) {
-                    max = MAX(max, sk[i02]);
-                }
-
                ggml_float sum = ggml_vec_soft_max_f32(ne00, dp, wp, max);
                assert(sum > 0.0);

-                if (sk) {
-                    sum += (ggml_float) expf(sk[i02] - max);
-                }
-
                sum = 1.0/sum;
                ggml_vec_scale_f32(ne00, dp, sum);

@@ -6014,7 +5836,6 @@ void ggml_compute_forward_clamp(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -8168,14 +7989,12 @@ void ggml_compute_forward_argsort(

 static void ggml_compute_forward_flash_attn_ext_f16(
        const ggml_compute_params * params,
+        const ggml_tensor * q,
+        const ggml_tensor * k,
+        const ggml_tensor * v,
+        const ggml_tensor * mask,
        ggml_tensor * dst) {

-    const ggml_tensor * q     = dst->src[0];
-    const ggml_tensor * k     = dst->src[1];
-    const ggml_tensor * v     = dst->src[2];
-    const ggml_tensor * mask  = dst->src[3];
-    const ggml_tensor * sinks = dst->src[4];
-
    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
@@ -8370,23 +8189,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
            }
        }

-        // sinks
-        if (sinks) {
-            const float s = ((float *)((char *) sinks->data))[h];
-
-            float ms = 1.0f;
-            float vs = 1.0f;
-
-            if (s > M) {
-                ms = expf(M - s);
-                ggml_vec_scale_f32(DV, VKQ32, ms);
-            } else {
-                vs = expf(s - M);
-            }
-
-            S = S*ms + vs;
-        }
-
        // V /= S
        const float S_inv = 1.0f/S;
        ggml_vec_scale_f32(DV, VKQ32, S_inv);
@@ -8406,13 +8208,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(

 void ggml_compute_forward_flash_attn_ext(
        const ggml_compute_params * params,
+        const ggml_tensor * q,
+        const ggml_tensor * k,
+        const ggml_tensor * v,
+        const ggml_tensor * mask,
        ggml_tensor * dst) {
    switch (dst->op_params[3]) {
        case GGML_PREC_DEFAULT:
        case GGML_PREC_F32:
            {
                // uses F32 accumulators
-                ggml_compute_forward_flash_attn_ext_f16(params, dst);
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
            } break;
        default:
            {
@@ -9274,10 +9080,6 @@ void ggml_compute_forward_glu(
            {
                ggml_compute_forward_swiglu(params, dst);
            } break;
-        case GGML_GLU_OP_SWIGLU_OAI:
-            {
-                ggml_compute_forward_swiglu_oai(params, dst);
-            } break;
        case GGML_GLU_OP_GEGLU_ERF:
            {
                ggml_compute_forward_geglu_erf(params, dst);
@@ -10330,7 +10132,6 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const int ir1 = MIN(ir0 + dr, nr);

    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
-
    const float alpha  = adamw_params_ptr[0];
    const float beta1  = adamw_params_ptr[1];
    const float beta2  = adamw_params_ptr[2];
@@ -10338,7 +10139,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const float wd     = adamw_params_ptr[4];
    const float beta1h = adamw_params_ptr[5];
    const float beta2h = adamw_params_ptr[6];
-    const float keep   = 1.f - alpha * wd;
+
    for (int ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir/(ne02*ne01);
        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
@@ -10361,7 +10162,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
            // The weight decay is applied independently of the Adam momenta m and v.
            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
            // See: https://arxiv.org/pdf/1711.05101v3.pdf
-            w[i00] = w[i00] * keep - alpha * mh / vh;
+            w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
        }
    }
 }
@@ -10383,63 +10184,3 @@ void ggml_compute_forward_opt_step_adamw(
            }
    }
 }
-
-static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0       = dst->src[0];
-    const ggml_tensor * src0_grad  = dst->src[1];
-    const ggml_tensor * sgd_params = dst->src[2];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
-    GGML_ASSERT(ggml_nelements(sgd_params) == 2);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1) / nth;
-
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // using adamw param subset we care about - alpha, wd - could have a separate struct
-    const float * sgd_params_ptr   = ggml_get_data_f32(sgd_params);
-    const float   alpha            = sgd_params_ptr[0];
-    const float   keep             = 1.f - alpha * sgd_params_ptr[1];
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        const int64_t i03 = ir / (ne02 * ne01);
-        const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
-        const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
-
-        const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;
-
-        float *       w = (float *) ((char *) src0->data + offset);                   // weight
-        const float * g = (const float *) ((const char *) src0_grad->data + offset);  // grad
-
-        for (int i00 = 0; i00 < ne00; ++i00) {
-            w[i00] = w[i00] * keep - alpha * g[i00];
-        }
-    }
-}
-
-void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_opt_step_sgd_f32(params, dst);
-            }
-            break;
-        default:
-            {
-                GGML_ABORT("fatal error - sgd is F32 only");
-            }
-    }
-}
@@ -29,7 +29,6 @@ extern "C" {

 void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_add_id(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -83,7 +82,13 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_flash_attn_ext(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * q,
+    const struct ggml_tensor * k,
+    const struct ggml_tensor * v,
+    const struct ggml_tensor * mask,
+    struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_back(
        const struct ggml_compute_params * params,
        const bool masked,
@@ -107,7 +112,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
 #ifdef __cplusplus
 }
 #endif
@@ -46,10 +46,6 @@ void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
    quantize_row_q8_1_ref(x, y, k);
 }

-void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
-    quantize_row_mxfp4_ref(x, y, k);
-}
-
 //
 // 2-6 bit quantization in super-blocks
 //
@@ -185,37 +181,6 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
    *s = sumf;
 }

-void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK_MXFP4 == 0);
-    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
-
-    const block_mxfp4 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_MXFP4;
-
-    int ib = 0;
-    float sumf = 0;
-
-    for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
-
-        int sumi1 = 0;
-        int sumi2 = 0;
-        for (int j = 0; j < QK_MXFP4/2; ++j) {
-            sumi1 += y[ib].qs[j +          0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >>  4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
-    *s = sumf;
-}
-
 void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -19,8 +19,6 @@ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
 void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -41,8 +39,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

-void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -71,12 +67,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
 void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
 void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -206,9 +206,8 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
    const int ncols_interleaved = 4;
    const int blocklen = 4;

-    assert(nr == 1);
-    assert(n % qk == 0);
-    assert(nc % ncols_interleaved == 0);
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
@@ -308,28 +307,30 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

-    float sumf[8];
-    int sumi;
+    {
+        float sumf[8];
+        int sumi;

-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                        }
+                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
 }

@@ -493,73 +494,43 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
    const int ncols_interleaved = 4;
    const int blocklen = 4;

-    assert(nr == 1);
-    assert(n % qk == 0);
-    assert(nc % ncols_interleaved == 0);
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);

+    UNUSED(s);
    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);

-    float sumf[4];
-    int sumi;
+    {
+        float sumf[4];
+        int sumi;

-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
-}
-
-void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 8;
-    const int blocklen = 8;
-
-    assert(nr == 1);
-    assert(n % qk == 0);
-    assert(nc % ncols_interleaved == 0);
-
-    UNUSED(bs);
-    UNUSED(nr);
-
-    float sumf[8];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
 }

@@ -963,50 +934,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
    }
 }

-void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 8;
-    const int blocklen = 8;
-
-    assert(n % qk == 0);
-    assert(nr % 4 == 0);
-    assert(nc % ncols_interleaved == 0);
-
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
-}
-
 } // extern "C"

 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1358,16 +1285,15 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s

 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
    GGML_ASSERT(interleave_block == 4);

-    const block_iq4_nl   * src = (const block_iq4_nl   *)data;
-          block_iq4_nlx4 * dst = (      block_iq4_nlx4 *)t->data;
-
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
    block_iq4_nl dst_tmp[4];
-
    int nrow = ggml_nrows(t);
    int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_NL;
+    int nblocks = t->ne[0] / QK4_0;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

@@ -1389,63 +1315,6 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
    GGML_UNUSED(data_size);
 }

-static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
-    block_iq4_nlx8 out;
-
-    for (int i = 0; i < 8; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_NL * 4 / blck_size_interleave;
-
-    if (blck_size_interleave == 8) {
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 8;
-            int src_offset = (i / 8) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    return out;
-}
-
-static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    GGML_ASSERT(interleave_block == 8);
-
-    const block_iq4_nl   * src = (const block_iq4_nl   *)data;
-          block_iq4_nlx8 * dst = (      block_iq4_nlx8 *)t->data;
-
-    block_iq4_nl dst_tmp[8];
-
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 8;
-    int nblocks = t->ne[0] / QK4_NL;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
-
-    if (t->ne[1] % nrows_interleaved != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
 namespace ggml::cpu::repack {
 // repack
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1481,10 +1350,6 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
 //    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 //}

-template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
-}
-
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1513,10 +1378,6 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1545,10 +1406,6 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
 class tensor_traits_base : public ggml::cpu::tensor_traits {
  public:
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1823,7 +1680,6 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
-    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1854,11 +1710,6 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
-        if (ggml_cpu_has_avx2()) {
-            if (cur->ne[1] % 8 == 0) {
-                return &iq4_nl_8x8_q8_0;
-            }
-        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &iq4_nl_4x4_q8_0;
@@ -67,13 +67,6 @@ struct block_iq4_nlx4 {

 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

-struct block_iq4_nlx8 {
-    ggml_half d[8];            // deltas for 8 iq4_nl blocks
-    uint8_t   qs[QK4_NL * 4];  // nibbles / quants for 8 iq4_nl blocks
-};
-
-static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
-
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -87,14 +80,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -106,14 +97,12 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 #if defined(__cplusplus)
 } // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 }  // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra->context) {
            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
            auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }

 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra->context) {
            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
            auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -33,6 +33,6 @@ class extra_buffer_type {
 }  // namespace ggml::cpu

 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();

 #endif
@@ -55,22 +55,7 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)

 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
-    int i = 0;
-#if defined(__AVX2__)
-    for (; i + 7 < n; i += 8) {
-        __m256 vx = _mm256_loadu_ps(x + i);
-        __m256 vy = _mm256_loadu_ps(y + i);
-        __m256 vz = _mm256_add_ps(vx, vy);
-        _mm256_storeu_ps(z + i, vz);
-    }
-#endif
-    for (; i < n; ++i) {
-        z[i] = x[i] + y[i];
-    }
-}
-
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; } // element-wise z[i] = x[i] + y[i] for i in [0, n), plain scalar loop
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
@@ -1007,9 +992,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *

 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
-        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
-        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
    }
 }

@@ -120,10 +120,6 @@ if (CUDAToolkit_FOUND)

    set(CUDA_FLAGS -use_fast_math -extended-lambda)

-    if (GGML_CUDA_DEBUG)
-        list(APPEND CUDA_FLAGS -lineinfo)
-    endif()
-
    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
        # Options are:
        # - none (not recommended)
@@ -1,58 +0,0 @@
-#include "add-id.cuh"
-
-static __global__ void add_id_kernel(
-        const float * src0, const float * src1, const int32_t * src2, float * dst,
-        int64_t ne0, int64_t ne1,
-        size_t nb01, size_t nb02,
-        size_t nb11,
-        size_t nb21
-    ) {
-
-    const int64_t i1 = blockIdx.x;
-    const int64_t i2 = blockIdx.y;
-
-    const int i11 = *(int32_t *) ((char *) src2 + i1*sizeof(int32_t) + i2*nb21);
-
-    const size_t nb1 = ne0 * sizeof(float);
-    const size_t nb2 = ne1 * nb1;
-
-    float * dst_row = (float *)((char *)dst + i1*nb1 + i2*nb2);
-    const float * src0_row = (const float *)((char *)src0 +  i1*nb01 + i2*nb02);
-    const float * src1_row = (const float *)((char *)src1 + i11*nb11);
-
-    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
-        dst_row[i0] = src0_row[i0] + src1_row[i0];
-    }
-}
-
-void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const ggml_tensor * src2 = dst->src[2];
-
-    GGML_TENSOR_TERNARY_OP_LOCALS
-
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(src2->type == GGML_TYPE_I32);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-    GGML_ASSERT(nb20 == sizeof(int32_t));
-
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    const int32_t * src2_d = (const int32_t *)src2->data;
-    float * dst_d = (float *)dst->data;
-
-    int threads = std::min((int)ne00, 768); // cols
-    dim3 blocks(ne01, ne02); // n_experts_used, n_tokens
-    add_id_kernel<<<blocks, threads, 0, ctx.stream()>>>(
-        src0_d, src1_d, src2_d, dst_d,
-        ne0, ne1,
-        nb01, nb02,
-        nb11,
-        nb21
-    );
-}
@@ -1,3 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -1,7 +1,6 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-impl.h"
 #include "ggml-cuda.h"

 #include <cstdint>
@@ -87,10 +86,6 @@
 #define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
-#    define GGML_CUDA_USE_CUB
-#endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
-
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
    return false;
@@ -237,13 +232,9 @@ typedef float2 dfloat2;
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
-#define TURING_MMA_AVAILABLE
+#define NEW_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
-#define AMPERE_MMA_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
-
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -311,16 +302,12 @@ static bool amd_mfma_available(const int cc) {
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static bool turing_mma_available(const int cc) {
+static bool new_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

-static bool ampere_mma_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
-}
-
 static bool cp_async_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
@@ -424,6 +411,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

+// Row reduction kernel: block `row` (= blockIdx.x) adds up the ncols floats of row `row` of x and thread 0 stores the sum (norm=false) or the mean (norm=true) in dst[row].
+template<bool norm>
+static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;  // one thread block per row
+    const int col = threadIdx.x; // first column visited by this thread
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) { // threads of the block stride across the row
+        sum += x[(int64_t) row * ncols + i]; // 64-bit index: row * ncols does not fit in int for inputs with more than 2^31 elements
+    }
+
+    sum = warp_reduce_sum(sum); // NOTE(review): presumably combines the lanes of one warp only -- confirm the launch never uses more than one warp per block
+
+    if (col != 0) { // only thread 0 of the block writes the result
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP
@@ -464,21 +471,25 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }

 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
-#elif CUDART_VERSION >= CUDART_HMAX
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
 #endif
 }

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
   for (int offset = width/2; offset > 0; offset >>= 1) {
       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -487,7 +498,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
   GGML_UNUSED(x);
   NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }

 #if CUDART_VERSION < CUDART_HMASK
@@ -538,24 +549,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif // defined(GGML_USE_HIP)
 }

-static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
-#if CUDART_VERSION >= 12080
-    const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x);
-    return (float) e;
-#else
-    uint32_t bits;
-    if (x == 0) {
-        bits = 0x00400000;
-    } else {
-        bits = (uint32_t) x << 23;
-    }
-
-    float result;
-    memcpy(&result, &bits, sizeof(float));
-    return result;
-#endif // CUDART_VERSION >= 12050
-}
-
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

 static __device__ __forceinline__ float get_alibi_slope(
@@ -614,13 +607,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qi = QI8_0;
 };

-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
-    static constexpr int qk = QK_MXFP4;
-    static constexpr int qr = QR_MXFP4;
-    static constexpr int qi = QI_MXFP4;
-};
-
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
    dequantize_kernel(vx, ib, iqs, v);

    const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = ggml_cuda_cast<dst_t>(v.x);
-    y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
+    y[iy0 + 0]        = float(v.x);
+    y[iy0 + y_offset] = float(v.y);
 }

 template <bool need_check>
@@ -465,24 +465,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
    }
 }

-template<typename dst_t>
-static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int64_t i   = blockIdx.x;
-    const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK_MXFP4);
-
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
-    const uint8_t  * q4 = x[ib].qs + 4*il;
-    const float d = ggml_cuda_e8m0_to_fp32(x[ib].e);
-    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf]*0.5f;
-        y[j+16] = d * kvalues_mxfp4[q4[j] >>  4]*0.5f;
-    }
-}
-
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * vx, dst_t * y,
        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
@@ -606,12 +588,6 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

-template<typename dst_t>
-static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
-    const int nb = (k + QK_K - 1) / QK_K;
-    dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
-}
-
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
@@ -630,7 +606,7 @@ static __global__ void convert_unary(

    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
-    y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
+    y[iy] = float(x[ix]);
 }

 template <typename src_t, typename dst_t>
@@ -701,8 +677,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
-        case GGML_TYPE_MXFP4:
-            return dequantize_row_mxfp4_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cont_cuda<float>;
        case GGML_TYPE_BF16:
@@ -752,8 +726,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
-        case GGML_TYPE_MXFP4:
-            return dequantize_row_mxfp4_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cont_cuda<half>;
        case GGML_TYPE_BF16:
@@ -29,16 +29,3 @@ typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
 to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
-
-template<typename dst_t, typename src_t>
- __host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
-    if constexpr (std::is_same_v<dst_t, src_t>) {
-        return x;
-    } else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
-        return __float2bfloat16(float(x));
-    } else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
-        return __bfloat162float(x);
-    } else {
-        return float(x);
-    }
-}
@@ -1,7 +1,15 @@
 #pragma once

 #include "ggml-common.h"
-#include "convert.cuh"
+
+template<typename src_t, typename dst_t>
+static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
+    if constexpr (!std::is_same_v<src_t, dst_t>) {
+        *dst = float(*src); // differing types: convert through float
+    } else {
+        *dst = *src;        // identical types: plain copy
+    }
+}

 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
    if (x <= val[0]) return 0;
@@ -213,5 +221,5 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {

 template<typename src_t, typename dst_t>
 static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
-    *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
+    convert_flt((const src_t *)cxi, (dst_t *)cdsti);
 }
@@ -15,7 +15,6 @@ typedef void (* fattn_kernel_t)(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -539,15 +538,11 @@ static __global__ void flash_attn_mask_to_KV_max(
        all_inf = warp_reduce_all(all_inf);

        if (!all_inf) {
+            KV_max_sj += FATTN_KQ_STRIDE;
            break;
        }
    }

-    // If the break in the loop was not triggered, KV_max_sj is now -FATTN_KQ_STRIDE.
-    // If the break was triggered it's the lower edge of the tile with the first non-masked values.
-    // In either case, walk back the decrementation by FATTN_KQ_STRIDE.
-    KV_max_sj += FATTN_KQ_STRIDE;
-
    if (threadIdx.x != 0) {
        return;
    }
@@ -741,8 +736,7 @@ void launch_fattn(

    GGML_ASSERT(V || is_mla);

-    const ggml_tensor * mask  = dst->src[3];
-    const ggml_tensor * sinks = dst->src[4];
+    const ggml_tensor * mask = dst->src[3];

    ggml_tensor * KQV = dst;

@@ -946,7 +940,6 @@ void launch_fattn(
        K_data,
        V_data,
        mask ? ((const char *) mask->data) : nullptr,
-        sinks ? ((const char *) sinks->data) : nullptr,
        KV_max.ptr,
        !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
@@ -418,7 +418,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        float        * const __restrict__ KQ_max,
        float        * const __restrict__ KQ_rowsum,
        const int kb0) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
    typedef fattn_mma_f16_config<DKQ, DV> c;

 #ifdef CP_ASYNC_AVAILABLE
@@ -776,7 +776,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
    NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
 }

 template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup>
@@ -785,7 +785,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const half2  * const __restrict__ K_h2,
        const half2  * const __restrict__ V_h2,
        const half2  * const __restrict__ mask_h2,
-        const float  * const __restrict__ sinks_f,
        float2       * const __restrict__ dstk,
        float2       * const __restrict__ dstk_fixup,
        const float scale,
@@ -801,7 +800,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const int jt,
        const int kb0_start,
        const int kb0_stop) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    typedef fattn_mma_f16_config<DKQ, DV> c;
@@ -958,52 +957,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        }
    }

-    // If attention sinks are used, potentially re-scale if KQ_max is small.
-    // Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum
-    //     so it's being done unconditionally for every thread.
-    if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
-        float KQ_max_scale[cols_per_thread];
-#pragma unroll
-        for (int col = 0; col < cols_per_thread; ++col) {
-            static_assert(ntiles == 1 || ntiles == 2, "ntiles > 2 not implemented");
-            const int jc = ntiles == 1 ? 2*tile_C_VKQ::get_j(col/2) + col % 2 : tile_C_VKQ_16::get_i(col);
-            const float sink = sinks_f[jc % ncols2];
-
-            const float KQ_max_new = fmaxf(KQ_max[col], sink);
-            const float KQ_max_diff = KQ_max[col] - KQ_max_new;
-            KQ_max_scale[col] = expf(KQ_max_diff);
-            KQ_max[col] = KQ_max_new;
-
-            *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
-
-            const float KQ_max_add = expf(sink - KQ_max_new);
-            KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_max_add;
-        }
-
-        if (ntiles == 1) {
-            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
-#pragma unroll
-            for (int i = 0; i < DV/tile_C_VKQ::I; ++i) {
-#pragma unroll
-                for (int l = 0; l < tile_C_VKQ::ne; ++l) {
-                    VKQ_C[i].x[l] *= KQ_max_scale_h2;
-                }
-            }
-        } else {
-#pragma unroll
-            for (int col = 0; col < cols_per_thread; ++col) {
-                const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]);
-#pragma unroll
-                for (int i = 0; i < DV/tile_C_VKQ_16::J; ++i) {
-#pragma unroll
-                    for (int l0 = 0; l0 < tile_C_VKQ_16::ne; l0 += 2) {
-                        VKQ_C_16[i*ntiles/2 + col/2].x[l0 + col % 2] *= KQ_max_scale_h2;
-                    }
-                }
-            }
-        }
-    }
-
    // Combine VKQ accumulator values if np > 1.
    // It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM.
    // So also write VKQ accumulators to shared memory in column-major format if np == 1.
@@ -1243,7 +1196,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask);
    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
    NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
 }

 template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla>
@@ -1253,7 +1206,6 @@ static __global__ void flash_attn_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -1270,7 +1222,7 @@ static __global__ void flash_attn_ext_f16(
                            const int32_t nb21, const int32_t nb22, const int64_t nb23,
                            const int32_t ne31, const int32_t ne32, const int32_t ne33,
                            const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
+#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)

    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
@@ -1315,24 +1267,20 @@ static __global__ void flash_attn_ext_f16(
    // kb0 == k start index when in the output tile.
    int kb0_start = kbc % iter_k;
    int kb0_stop  = min(iter_k, kb0_start + kbc_stop - kbc);
-
    while (kbc < kbc_stop && kb0_stop == iter_k) {
        const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
-        const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
-        const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
+        const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
+        const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.

-        const int head0 = zt * ncols2;
-
-        const float2 * Q_f2    = (const float2 *) (Q + nb03*sequence + nb02* head0);
-        const half2  * K_h2    = (const half2  *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
+        const float2 * Q_f2    = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
+        const half2  * K_h2    = (const half2  *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
        const half2  * mask_h2 = ncols2 == 1 && !mask ? nullptr :
            (const half2  *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
-        float2       * dstk    = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2);
+        float2       * dstk    = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);

-        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
-        const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
+        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));

-        const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
+        const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;

        const int kb0_start_kernel = kb0_start * kb_niter;
        int       kb0_stop_kernel  = kb0_stop  * kb_niter;
@@ -1345,12 +1293,12 @@ static __global__ void flash_attn_ext_f16(
        if (kb0_start == 0) {
            constexpr bool needs_fixup = false; // CUDA block is working on an entire tile.
            flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
-                (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
+                (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
                 ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
        } else {
            constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile.
            flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
-                (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
+                (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
                 ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
        }

@@ -1366,21 +1314,18 @@ static __global__ void flash_attn_ext_f16(
    }

    const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
-    const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
-    const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
+    const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
+    const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.

-    const int head0 = zt * ncols2;
-
-    const float2 * Q_f2    = (const float2 *) (Q + nb03*sequence + nb02* head0);
-    const half2  * K_h2    = (const half2  *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
+    const float2 * Q_f2    = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
+    const half2  * K_h2    = (const half2  *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
    const half2  * mask_h2 = ncols2 == 1 && !mask ? nullptr :
        (const half2  *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
-    float2       * dstk    = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2);
+    float2       * dstk    = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);

-    const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
-    const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
+    const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));

-    const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
+    const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;

    const int kb0_start_kernel = kb0_start * kb_niter;
    int       kb0_stop_kernel  = kb0_stop  * kb_niter;
@@ -1392,10 +1337,10 @@ static __global__ void flash_attn_ext_f16(
    constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
    constexpr bool needs_fixup = false;
    flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
-        (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
+        (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
         ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
@@ -1407,7 +1352,7 @@ static __global__ void flash_attn_ext_f16(
    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
    NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }

 template <int DKQ, int DV, int ncols1, int ncols2>
@@ -13,7 +13,6 @@ static __global__ void flash_attn_tile_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -49,11 +48,10 @@ static __global__ void flash_attn_tile_ext_f16(
    const int sequence = blockIdx.z / ne02;
    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2   = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
-    const half2  * K_h2   = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
-    const half2  * V_h2   = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
-    const half   * maskh  = (const half   *) (mask  + nb33*(sequence % ne33)                          + nb31*ic0);
-    const float  * sinksf = (const float  *) (sinks);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half   * maskh = (const half   *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);

    const int stride_KV2 = nb11 / sizeof(half2);

@@ -243,31 +241,6 @@ static __global__ void flash_attn_tile_ext_f16(
        __syncthreads();
    }

-    //Attention sink: adjust running max and sum once per head
-    if (sinksf && blockIdx.y == 0) {
-        const half sink = __float2half(sinksf[head]);
-
-#pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-            half kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new_j));
-            kqmax[j0/nwarps] = kqmax_new_j;
-
-            const half val = hexp(sink - kqmax[j0/nwarps]);
-            kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
-            if (threadIdx.x == 0) {
-                kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val);
-            }
-
-#pragma unroll
-            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-                VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
-            }
-        }
-    }
-
    float2 * dst2 = (float2 *) dst;

 #pragma unroll
@@ -299,7 +272,7 @@ static __global__ void flash_attn_tile_ext_f16(
        }
    }
 #else
-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
@@ -13,7 +13,6 @@ static __global__ void flash_attn_tile_ext_f32(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -38,7 +37,7 @@ static __global__ void flash_attn_tile_ext_f32(
    return;
 #endif // FP16_MMA_AVAILABLE
    if (use_logit_softcap && !(D == 128 || D == 256)) {
-        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
        GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
        GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
@@ -60,11 +59,10 @@ static __global__ void flash_attn_tile_ext_f32(
    const int sequence = blockIdx.z / ne02;
    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2   = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
-    const half2  * K_h2   = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
-    const half2  * V_h2   = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
-    const half   * maskh  = (const half   *) (mask  + nb33*(sequence % ne33)                          + nb31*ic0);
-    const float  * sinksf = (const float  *) (sinks);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half   * maskh = (const half   *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);

    const int stride_KV2 = nb11 / sizeof(half2);

@@ -253,33 +251,6 @@ static __global__ void flash_attn_tile_ext_f32(
        __syncthreads();
    }

-
-    //Attention sink: adjust running max and sum once per head
-    if (sinksf && blockIdx.y == 0) {
-        const float sink = sinksf[head];
-
-#pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-            float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
-            kqmax[j0/nwarps] = kqmax_new_j;
-
-            const float val = expf(sink - kqmax[j0/nwarps]);
-            kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
-            if (threadIdx.x == 0) {
-                kqsum[j0/nwarps] += val;
-            }
-
-#pragma unroll
-            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-                VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
-                VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
-            }
-        }
-    }
-
    float2 * dst2 = (float2 *) dst;

 #pragma unroll
@@ -16,7 +16,6 @@ static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -62,8 +61,7 @@ static __global__ void flash_attn_vec_ext_f16(
    K += nb13*sequence + nb12*(head / gqa_ratio);
    V += nb23*sequence + nb22*(head / gqa_ratio);

-    const half  * maskh  = (const half  *) (mask + nb33*(sequence % ne33) + nb31*ic0);
-    const float * sinksf = (const float *) (sinks);
+    const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);

    const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
@@ -77,12 +75,11 @@ static __global__ void flash_attn_vec_ext_f16(
    half2 * KQ2 = (half2 *) KQ;

    half kqmax[ncols];
-    half kqsum[ncols];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -HALF_MAX_HALF;
-        kqsum[j] = 0.0f;
    }
+    half kqsum[ncols] = {0.0f};

    __shared__ half kqmax_shared[ncols][WARP_SIZE];
    __shared__ half kqsum_shared[ncols][WARP_SIZE];
@@ -286,39 +283,6 @@ static __global__ void flash_attn_vec_ext_f16(
        __syncthreads();
    }

-    if (sinksf && blockIdx.y == 0) {
-        const half sink = __float2half(sinksf[head]);
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            if (threadIdx.x == 0) {
-                kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
-            }
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
-            kqmax[j] = kqmax_new_j;
-
-            const half val = hexp(sink - kqmax[j]);
-            kqsum[j] = kqsum[j]*KQ_max_scale;
-
-            if (tid == 0) {
-                kqsum[j] += val;
-            }
-
-            VKQ[j] *= __half2half2(KQ_max_scale);
-        }
-
-        __syncthreads();
-    }
-
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum((float)kqsum[j]);
@@ -349,7 +313,7 @@ static __global__ void flash_attn_vec_ext_f16(
        dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
@@ -16,7 +16,6 @@ static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -73,8 +72,7 @@ static __global__ void flash_attn_vec_ext_f32(
    K += nb13*sequence + nb12*(head / gqa_ratio);
    V += nb23*sequence + nb22*(head / gqa_ratio);

-    const half  * maskh  = (const half  *) (mask + nb33*(sequence % ne33) + nb31*ic0);
-    const float * sinksf = (const float *) (sinks);
+    const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);

    const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);

@@ -90,12 +88,11 @@ static __global__ void flash_attn_vec_ext_f32(
    }

    float kqmax[ncols];
-    float kqsum[ncols];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -FLT_MAX/2.0f;
-        kqsum[j] = 0.0f;
    }
+    float kqsum[ncols] = {0.0f};

    __shared__ float kqmax_shared[ncols][WARP_SIZE];
    __shared__ float kqsum_shared[ncols][WARP_SIZE];
@@ -282,39 +279,6 @@ static __global__ void flash_attn_vec_ext_f32(
        __syncthreads();
    }

-    if (sinksf && blockIdx.y == 0) {
-        const float sink = sinksf[head];
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            if (threadIdx.x == 0) {
-                kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
-            }
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
-            kqmax[j] = kqmax_new_j;
-
-            const float val = expf(sink - kqmax[j]);
-            kqsum[j] = kqsum[j]*KQ_max_scale;
-
-            if (tid == 0) {
-                kqsum[j] += val;
-            }
-
-            VKQ[j] *= KQ_max_scale;
-        }
-
-        __syncthreads();
-    }
-
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
@@ -15,6 +15,7 @@ namespace wmma = mtmusa::wmma;
 namespace wmma = nvcuda::wmma;
 #endif // GGML_USE_MUSA
 #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
+#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
 namespace wmma = rocwmma;
 #endif // !defined(GGML_USE_HIP)
@@ -28,7 +29,6 @@ static __global__ void flash_attn_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
-        const char * __restrict__ sinks,
        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
@@ -81,12 +81,11 @@ static __global__ void flash_attn_ext_f16(
    const int sequence = blockIdx.z / ne02;
    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f    = (const float *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
-    const half  * K_h    = (const half  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
-    const half  * V_h    = (const half  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
-    const half  * maskh  = (const half  *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);
-    const half2 * mask2  = (const half2 *)  maskh;
-    const float * sinksf = (const float *) sinks;
+    const float * Q_f   = (const float *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half  * K_h   = (const half  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half  * V_h   = (const half  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half  * maskh = (const half  *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);
+    const half2 * mask2 = (const half2 *)  maskh;

    const int stride_Q  = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);
@@ -381,53 +380,6 @@ static __global__ void flash_attn_ext_f16(
        __syncthreads();
    }

-    // Apply attention sinks
-    if (sinksf && blockIdx.y == 0) {
-        const float sinkf = sinksf[head];
-        const half  sinkh = __float2half(sinkf);
-
-#pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-            const int j = j0 + threadIdx.y;
-
-            if (std::is_same<KQ_acc_t, float>::value) {
-                float kqmax_new = fmaxf(KQ_max_f[j0/nwarps], sinkf);
-
-                const float KQ_max_scale = expf(KQ_max_f[j0/nwarps] - kqmax_new);
-                KQ_max_f[j0/nwarps] = kqmax_new;
-
-                KQ_rowsum_f[j0/nwarps] = KQ_rowsum_f[j0/nwarps] * KQ_max_scale + expf(sinkf - KQ_max_f[j0/nwarps]);
-
-                const half2 scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
-#pragma unroll
-                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                    const int i = i0 + threadIdx.x;
-                    if (i0 + warp_size > D/2 && i >= D/2) break;
-                    VKQ2[j*(D_padded/2) + i] *= scale_h2;
-                }
-            } else {
-                half kqmax_old = __low2half(KQ_max_h2[j0/nwarps]);
-                half kqmax_new = fmaxf(kqmax_old, sinkh);
-                KQ_max_h2[j0/nwarps] = __half2half2(kqmax_new);
-
-                const half  KQ_max_scale_h = hexp(kqmax_old - kqmax_new);
-                const half2 KQ_max_scale   = __half2half2(KQ_max_scale_h);
-
-                KQ_rowsum_h2[j0/nwarps] = KQ_rowsum_h2[j0/nwarps] * KQ_max_scale;
-                const half val = hexp(sinkh - kqmax_new);
-                KQ_rowsum_h2[j0/nwarps].x = __hadd(KQ_rowsum_h2[j0/nwarps].x, val);
-
-#pragma unroll
-                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                    const int i = i0 + threadIdx.x;
-                    if (i0 + warp_size > D/2 && i >= D/2) break;
-                    VKQ2[j*(D_padded/2) + i] *= KQ_max_scale;
-                }
-            }
-        }
-
-        __syncthreads();
-    }
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j_VKQ = j0 + threadIdx.y;
@@ -471,7 +423,7 @@ static __global__ void flash_attn_ext_f16(
        dst_meta[j_dst_unrolled] = dst_meta_val;
    }
 #else
-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
@@ -269,11 +269,11 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
 }

 void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * KQV   = dst;
-    const ggml_tensor * Q     = dst->src[0];
-    const ggml_tensor * K     = dst->src[1];
-    const ggml_tensor * V     = dst->src[2];
-    const ggml_tensor * mask  = dst->src[3];
+    const ggml_tensor * KQV  = dst;
+    const ggml_tensor * Q    = dst->src[0];
+    const ggml_tensor * K    = dst->src[1];
+    const ggml_tensor * V    = dst->src[2];
+    const ggml_tensor * mask = dst->src[3];

    ggml_cuda_set_device(ctx.device);
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
@@ -316,7 +316,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
    const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
    const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
-    const bool mma_faster_for_bs1 = turing_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
+    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
        (cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
    if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
@@ -329,7 +329,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    }

    // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
-    if (fp16_mma_available(cc) && !turing_mma_available(cc)) {
+    if (fp16_mma_available(cc) && !new_mma_available(cc)) {
        ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
        return;
    }
@@ -1,6 +1,5 @@
 #include "getrows.cuh"
 #include "dequantize.cuh"
-#include "convert.cuh"

 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void k_get_rows(
@@ -35,8 +34,8 @@ static __global__ void k_get_rows(
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

-    dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dst_t>(v.x);
-    dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
+    dst_row[iybs + iqs + 0]        = float(v.x);
+    dst_row[iybs + iqs + y_offset] = float(v.y);
 }

 template<typename src0_t, typename dst_t>
@@ -63,7 +62,7 @@ static __global__ void k_get_rows_float(
    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);

-    dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
+    dst_row[i00] = float(src0_row[i00]);
 }

 template<typename grad_t, typename dst_t>
@@ -4,7 +4,6 @@

 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/add-id.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argmax.cuh"
 #include "ggml-cuda/argsort.cuh"
@@ -22,13 +21,11 @@
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
-#include "ggml-cuda/mmf.cuh"
 #include "ggml-cuda/mmq.cuh"
-#include "ggml-cuda/mmvf.cuh"
+#include "ggml-cuda/mmv.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
-#include "ggml-cuda/opt-step-sgd.cuh"
 #include "ggml-cuda/out-prod.cuh"
 #include "ggml-cuda/pad.cuh"
 #include "ggml-cuda/pool2d.cuh"
@@ -181,6 +178,30 @@ static int ggml_cuda_parse_id(char devName[]) {
 #endif // defined(GGML_USE_HIP)

 static ggml_cuda_device_info ggml_cuda_init() {
+#if defined(GGML_USE_HIP)
+    // Workaround for a rocBLAS bug when using multiple graphics cards:
+    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+    {
+        int major_version = 0;
+        size_t version_length = 0;
+        if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
+            std::vector<char> version(version_length+1, '\0');
+            if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
+                version.resize(::strlen(version.data()));
+                int parsed_value = 0;
+                if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
+                    major_version = parsed_value;
+                }
+            }
+        }
+        if (major_version < 4) {
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
+            rocblas_initialize();
+            CUDA_CHECK(cudaDeviceSynchronize());
+        }
+    }
+#endif
+
    ggml_cuda_device_info info = {};

    cudaError_t err = cudaGetDeviceCount(&info.device_count);
@@ -1986,9 +2007,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;

-    bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-    bool use_mul_mat_f     = !ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@@ -2008,18 +2027,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
            }

            const int cc            = ggml_cuda_info().devices[id].cc;
-            const int warp_size     = ggml_cuda_info().devices[id].warp_size;
            use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-            use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_vec         = use_mul_mat_vec           && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
        }
    } else {
        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
-        const int warp_size     = ggml_cuda_info().devices[ctx.device].warp_size;
        use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-        use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_vec         = use_mul_mat_vec           && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
    }

@@ -2032,17 +2047,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

    //TODO update for generic tensor parallelism
-    const int cc                 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const int cc                     = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    bool use_batched_cublas_f16  = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
    bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
    bool use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;

-    if (!split && use_mul_mat_vec_f) {
+    if (!split && use_mul_mat_vec) {
        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec_f(ctx, src0, src1, nullptr, dst);
-    } else if (!split && use_mul_mat_f) {
-        ggml_cuda_mul_mat_f(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
    } else if (!split && use_mul_mat_vec_q) {
        ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
    } else if (!split && use_mul_mat_q) {
@@ -2051,8 +2064,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // general KQ + KQV multi-batch without FlashAttention
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-    } else if (use_mul_mat_vec_f) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr);
+    } else if (use_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
    } else if (use_mul_mat_vec_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
    } else if (use_mul_mat_q) {
@@ -2080,7 +2093,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
            if (ggml_is_quantized(src0->type)) {
                ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
            } else {
-                ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
+                ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
            }
            return;
        }
@@ -2246,9 +2259,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_ADD1: // TODO: more efficient implementation
            ggml_cuda_op_add(ctx, dst);
            break;
-        case GGML_OP_ADD_ID:
-            ggml_cuda_op_add_id(ctx, dst);
-            break;
        case GGML_OP_SUB:
            ggml_cuda_op_sub(ctx, dst);
            break;
@@ -2323,9 +2333,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                case GGML_GLU_OP_SWIGLU:
                    ggml_cuda_op_swiglu(ctx, dst);
                    break;
-                case GGML_GLU_OP_SWIGLU_OAI:
-                    ggml_cuda_op_swiglu_oai(ctx, dst);
-                    break;
                case GGML_GLU_OP_GEGLU_ERF:
                    ggml_cuda_op_geglu_erf(ctx, dst);
                    break;
@@ -2480,9 +2487,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_OPT_STEP_ADAMW:
            ggml_cuda_opt_step_adamw(ctx, dst);
            break;
-        case GGML_OP_OPT_STEP_SGD:
-            ggml_cuda_opt_step_sgd(ctx, dst);
-            break;
        default:
            return false;
    }
@@ -2603,9 +2607,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
-    const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
-    const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
-    const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];
@@ -2628,13 +2629,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
        }

-        if (node->op == GGML_OP_ADD &&
-            node->src[1] && node->src[1]->ne[1] > 1 &&
-            (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
-            (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
-            strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
            // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
            // by means of matching node names. See
            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
@@ -3232,7 +3227,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
-                case GGML_GLU_OP_SWIGLU_OAI:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return ggml_is_contiguous_1(op->src[0]);
@@ -3283,7 +3277,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
-                    case GGML_TYPE_MXFP4:
                    case GGML_TYPE_Q2_K:
                    case GGML_TYPE_Q3_K:
                    case GGML_TYPE_Q4_K:
@@ -3430,7 +3423,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_ADD:
-        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
@@ -3505,17 +3497,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 #endif // FLASH_ATTN_AVAILABLE
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-                if (!turing_mma_available(cc)) {
+                if (!new_mma_available(cc)) {
                    return false;
                }
                const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
                return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
            }
-            // TODO: more general-purpose attention sink support [TAG_ATTN_SINKS]
-            if (op->src[4] && !fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc)
-                    && op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) {
-                return false;
-            }
            if (op->src[0]->ne[0] == 192) {
                return false;
            }
@@ -3540,7 +3527,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CROSS_ENTROPY_LOSS:
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
        case GGML_OP_OPT_STEP_ADAMW:
-        case GGML_OP_OPT_STEP_SGD:
            return true;
        default:
            return false;
@@ -3780,10 +3766,10 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
    }

    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .guid    = */ ggml_backend_cuda_guid(),
-        /* .iface   = */ ggml_backend_cuda_interface,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
-        /* .context = */ ctx,
+        /* .guid      = */ ggml_backend_cuda_guid(),
+        /* .interface = */ ggml_backend_cuda_interface,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
+        /* .context   = */ ctx,
    };

    return cuda_backend;
@@ -1,5 +1,7 @@
 #include "im2col.cuh"

+#define MIN(a, b) ((a) < (b) ? (a) : (b)) // outer parens keep the expansion atomic inside larger expressions; note each argument may be evaluated twice
+
 #define MAX_GRIDDIM_Z 65535

 template <typename T>
@@ -36,9 +38,6 @@ static  __global__ void im2col_kernel(
            dst[offset_dst] = x[offset_src + iih * IW + iiw];
        }
    }
-
-    GGML_UNUSED(IC);
-    GGML_UNUSED(KH);
 }

 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -1,14 +1,4 @@
 #include "mean.cuh"
-#include "reduce_rows.cuh"
-
-#ifdef GGML_CUDA_USE_CUB
-#include <cub/cub.cuh>
-using namespace cub;
-#endif  // GGML_CUDA_USE_CUB
-
-template <typename T> __global__ void divide_by_count(T * result, size_t count) {
-    *result /= static_cast<T>(count);
-}

 void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0   = dst->src[0];
@@ -23,51 +13,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

-// Special case for reducing vectors
-#ifdef GGML_CUDA_USE_CUB
-#ifdef USE_CUDA_GRAPH
-    cudaStreamCaptureStatus iscapturing;
-    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
-#endif // USE_CUDA_GRAPH
-    if ((nrows == 1) &&
-#ifdef USE_CUDA_GRAPH
-            // CUDA_GRAPHS_DISABLED
-            ((ncols > 65536) &&
-             ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-              ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
-              ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
-        // CUDA_GRAPHS ENABLED
-        ((ncols > 32768) &&
-         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
-           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
-#else
-        (ncols > 65536)) {
-#endif // USE_CUDA_GRAPH
-        // Single row - use device-wide reduction
-        size_t           tmp_size = 0;
-        ggml_cuda_pool & pool     = ctx.pool();
-
-        DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
-
-        ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
-        DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
-
-        // Divide by ncols
-        divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
-        return;
-    }
-#endif // GGML_CUDA_USE_CUB
-
+    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
-
-    const int id  = ggml_cuda_get_device();
-    const int nsm = ggml_cuda_info().devices[id].nsm;
-    if ((nrows / nsm) < 2) {
-        const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
-    } else {
-        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
-    }
+    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
 }
@@ -23,13 +23,13 @@
 static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
    int ret = 0;

-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
    asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
        : "=r"(ret) : "r"(x));
 #else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
-#endif // defined(TURING_MMA_AVAILABLE)
+#endif // defined(NEW_MMA_AVAILABLE)
    return ret;
 }

@@ -167,38 +167,6 @@ namespace ggml_cuda_mma {
        }
    };

-    template <int I_, int J_>
-    struct tile<I_, J_, nv_bfloat162> {
-        static constexpr int I  = I_;
-        static constexpr int J  = J_;
-        static constexpr int ne = I * J / WARP_SIZE;
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
-
-        static __device__ __forceinline__ int get_i(const int l) {
-            if constexpr (I == 8 && J == 8) {
-                return threadIdx.x / 4;
-            } else if constexpr (I == 16 && J == 4) {
-                return l * 8 + threadIdx.x / 4;
-            } else if constexpr (I == 16 && J == 8) {
-                return (l % 2) * 8 + threadIdx.x / 4;
-            } else {
-                static_assert(I == -1 && J == -1, "template specialization not implemented");
-            }
-        }
-
-        static __device__ __forceinline__ int get_j(const int l) {
-            if constexpr (I == 8 && J == 8) {
-                return l * 4 + threadIdx.x % 4;
-            } else if constexpr (I == 16 && J == 4) {
-                return threadIdx.x % 4;
-            } else if constexpr (I == 16 && J == 8) {
-                return (l / 2) * 4 + threadIdx.x % 4;
-            } else {
-                static_assert(I == -1 && J == -1, "template specialization not implemented");
-            }
-        }
-    };
-
    template <int I, int J>
    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
        tile<I, J/2, half2> ret;
@@ -241,7 +209,7 @@ namespace ggml_cuda_mma {
    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix(
            tile<8, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        int * xi = (int *) t.x;
        const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + ((threadIdx.x / t.I) * (t.J / 2)) % t.J;
        asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
@@ -249,13 +217,13 @@ namespace ggml_cuda_mma {
            : "l"(xs));
 #else
        load_generic(t, xs0, stride);
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix(
            tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        int * xi = (int *) t.x;
        const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
        asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
@@ -264,13 +232,13 @@ namespace ggml_cuda_mma {
 #else
        load_generic(xs0, stride);
        GGML_UNUSED(t);
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix(
            tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#if defined(TURING_MMA_AVAILABLE)
+#if defined(NEW_MMA_AVAILABLE)
        int * xi = (int * ) t.x;
        const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
        asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
@@ -278,13 +246,13 @@ namespace ggml_cuda_mma {
            : "l"(xs));
 #else
        load_generic(t, xs0, stride);
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix_trans(
            tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        int * xi = (int * ) t.x;
        const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
        asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
@@ -295,12 +263,12 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(xs0);
        GGML_UNUSED(stride);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 8, int> & D, const tile<16, 4, int> & A, const tile<8, 4, int> & B) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
        asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
            : "+r"(D.x[0]), "+r"(D.x[1]), "+r"(D.x[2]), "+r"(D.x[3])
@@ -319,12 +287,12 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 8, int> & D, const tile<16, 8, int> & A, const tile<8, 8, int> & B) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
        asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
            : "+r"(D.x[0]), "+r"(D.x[1]), "+r"(D.x[2]), "+r"(D.x[3])
@@ -349,12 +317,12 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 4, half2> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
        int       * Dxi = (int       *) D.x;
@@ -376,12 +344,12 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 8, half2> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
        int       * Dxi = (int       *) D.x;
@@ -412,29 +380,12 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
-    }
-
-    static __device__ __forceinline__ void mma(
-            tile<16, 8, float> & D, const tile<16, 8, float> & A, const tile<8, 8, float> & B) {
-#ifdef AMPERE_MMA_AVAILABLE
-        const int * Axi = (const int *) A.x;
-        const int * Bxi = (const int *) B.x;
-        int       * Dxi = (int       *) D.x;
-        asm("mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
-            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
-            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
-#else
-        GGML_UNUSED(D);
-        GGML_UNUSED(A);
-        GGML_UNUSED(B);
-        NO_DEVICE_CODE;
-#endif // AMPERE_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
        int       * Dxi = (int       *) D.x;
@@ -456,29 +407,12 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
-    }
-
-    static __device__ __forceinline__ void mma(
-            tile<16, 8, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<8, 8, nv_bfloat162> & B) {
-#ifdef AMPERE_MMA_AVAILABLE
-        const int * Axi = (const int *) A.x;
-        const int * Bxi = (const int *) B.x;
-        int       * Dxi = (int       *) D.x;
-        asm("mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
-            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
-            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
-#else
-        GGML_UNUSED(D);
-        GGML_UNUSED(A);
-        GGML_UNUSED(B);
-        NO_DEVICE_CODE;
-#endif // AMPERE_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 16, float> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) {
-#ifdef TURING_MMA_AVAILABLE
+#ifdef NEW_MMA_AVAILABLE
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
        int       * Dxi = (int       *) D.x;
@@ -509,7 +443,7 @@ namespace ggml_cuda_mma {
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
-#endif // TURING_MMA_AVAILABLE
+#endif // NEW_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
@@ -1,431 +0,0 @@
-#include "ggml.h"
-#include "common.cuh"
-#include "mma.cuh"
-#include "mmf.cuh"
-
-using namespace ggml_cuda_mma;
-
-#define MMF_ROWS_PER_BLOCK 32
-
-template <typename T, int rows_per_block, int cols_per_block, int nwarps>
-__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
-static __global__ void mul_mat_f(
-        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
-        const int ncols, const int nchannels_y, const int stride_row, const int stride_col_y, const int stride_col_dst,
-        const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
-        const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    typedef tile<16, 8, T>     tile_A;
-    typedef tile< 8, 8, T>     tile_B;
-    typedef tile<16, 8, float> tile_C;
-
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
-    constexpr int tile_k_padded = warp_size + 4;
-    constexpr int ntA = rows_per_block / tile_A::I;
-    constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
-
-    const int row0        = blockIdx.x * rows_per_block;
-    const int channel_dst = blockIdx.y;
-    const int channel_x   = channel_dst / channel_ratio;
-    const int channel_y   = channel_dst;
-    const int sample_dst  = blockIdx.z;
-    const int sample_x    = sample_dst / sample_ratio;
-    const int sample_y    = sample_dst;
-
-    x   += int64_t(sample_x)  *stride_sample_x   + channel_x  *stride_channel_x   + row0*stride_row ;
-    y   += int64_t(sample_y)  *stride_sample_y   + channel_y  *stride_channel_y;
-    dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
-
-    const float2 * y2 = (const float2 *) y;
-
-    extern __shared__ char data_mmv[];
-
-    tile_C C[ntA][ntB];
-
-    T * tile_xy = (T *) data_mmv + threadIdx.y*(tile_A::I * tile_k_padded);
-
-    for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
-        tile_A A[ntA][warp_size / tile_A::J];
-#pragma unroll
-        for (int itA = 0; itA < ntA; ++itA) {
-#pragma unroll
-            for (int i = 0; i < tile_A::I; ++i) {
-                tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row  + col];
-            }
-#pragma unroll
-            for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
-                load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
-            }
-        }
-
-#pragma unroll
-        for (int itB = 0; itB < ntB; ++itB) {
-            if constexpr (std::is_same_v<T, float>) {
-#pragma unroll
-                for (int j0 = 0; j0 < tile_B::I; ++j0) {
-                    const int j = j0 + itB*tile_B::I;
-
-                    tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
-                }
-            } else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
-#pragma unroll
-                for (int j0 = 0; j0 < tile_B::I; ++j0) {
-                    const int j = j0 + itB*tile_B::I;
-
-                    const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
-                    tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
-                }
-            } else {
-                static_assert(std::is_same_v<T, void>, "unsupported type");
-            }
-#pragma unroll
-            for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
-                tile_B B;
-                load_ldmatrix(B, tile_xy + k0, tile_k_padded);
-#pragma unroll
-                for (int itA = 0; itA < ntA; ++itA) {
-                    mma(C[itA][itB], A[itA][k0/tile_B::J], B);
-                }
-            }
-        }
-    }
-
-    float * buf_iw = (float *) data_mmv;
-    constexpr int kiw = nwarps*rows_per_block + 4;
-
-    if (nwarps > 1) {
-        __syncthreads();
-    }
-#pragma unroll
-    for (int itB = 0; itB < ntB; ++itB) {
-#pragma unroll
-        for (int itA = 0; itA < ntA; ++itA) {
-#pragma unroll
-            for (int l = 0; l < tile_C::ne; ++l) {
-                const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l);
-                const int j = itB*tile_C::J + tile_C::get_j(l);
-                buf_iw[j*kiw + i] = C[itA][itB].x[l];
-            }
-        }
-    }
-
-    if (nwarps > 1) {
-        __syncthreads();
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
-        const int j = j0 + threadIdx.y;
-
-        if (j0 + nwarps > cols_per_block && j >= cols_per_block) {
-            return;
-        }
-
-        float sum = 0.0f;
-        static_assert(rows_per_block == warp_size, "need loop/check");
-#pragma unroll
-        for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
-            const int i = i0 + threadIdx.x;
-
-            sum += buf_iw[j*kiw + i];
-        }
-        dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
-    }
-#else
-    NO_DEVICE_CODE;
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(ids); GGML_UNUSED(dst);
-    GGML_UNUSED(ncols); GGML_UNUSED(nchannels_y); GGML_UNUSED(stride_row); GGML_UNUSED(stride_col_y); GGML_UNUSED(stride_col_dst);
-    GGML_UNUSED(channel_ratio); GGML_UNUSED(stride_channel_x); GGML_UNUSED(stride_channel_y); GGML_UNUSED(stride_channel_dst);
-    GGML_UNUSED(sample_ratio); GGML_UNUSED(stride_sample_x); GGML_UNUSED(stride_sample_y); GGML_UNUSED(stride_sample_dst);
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-}
-
-template <typename T, int cols_per_block>
-static void mul_mat_f_cuda(
-        const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols_x, const int64_t nrows_x,
-        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
-        const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        cudaStream_t stream) {
-    typedef tile<16, 8, T>     tile_A;
-    typedef tile< 8, 8, T>     tile_B;
-    typedef tile<16, 8, float> tile_C;
-
-    GGML_ASSERT(!ids && "mul_mat_id not implemented");
-
-    GGML_ASSERT(ncols_x      % 2 == 0);
-    GGML_ASSERT(stride_row   % 2 == 0);
-    GGML_ASSERT(stride_col_y % 2 == 0);
-    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
-    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
-    const int64_t channel_ratio = nchannels_dst / nchannels_x;
-    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
-
-    const int device = ggml_cuda_get_device();
-    const int warp_size = ggml_cuda_info().devices[device].warp_size;
-
-    int64_t nwarps_best     = 1;
-    int64_t niter_best      = (ncols_x + warp_size*2 - 1) / (warp_size*2);
-    int64_t max_block_size  = 256;
-    for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
-        const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
-        if (niter < niter_best) {
-            niter_best  = niter;
-            nwarps_best = nwarps;
-        }
-    }
-
-    constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
-    const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
-    const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
-    const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
-    const dim3 block_nums(nrows_x/rows_per_block, nchannels_dst, nsamples_dst);
-    const dim3 block_dims(warp_size, nwarps_best, 1);
-    switch (nwarps_best) {
-        case 1: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 1><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 2: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 2><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 3: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 3><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 4: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 4><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 5: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 5><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 6: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 6><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 7: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 7><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 8: {
-            mul_mat_f<T, rows_per_block, cols_per_block, 8><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        default: {
-            GGML_ABORT("fatal error");
-        } break;
-    }
-}
-
-template <typename T>
-static void mul_mat_f_switch_cols_per_block(
-        const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
-        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
-        const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        cudaStream_t stream) {
-    switch (ncols_dst) {
-        case  1: {
-            mul_mat_f_cuda<T,  1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  2: {
-            mul_mat_f_cuda<T,  2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  3: {
-            mul_mat_f_cuda<T,  3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  4: {
-            mul_mat_f_cuda<T,  4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  5: {
-            mul_mat_f_cuda<T,  5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  6: {
-            mul_mat_f_cuda<T,  6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  7: {
-            mul_mat_f_cuda<T,  7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  8: {
-            mul_mat_f_cuda<T,  8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case  9: {
-            mul_mat_f_cuda<T,  9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 10: {
-            mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 11: {
-            mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 12: {
-            mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 13: {
-            mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 14: {
-            mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 15: {
-            mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        case 16: {
-            mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y,  nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x,                nsamples_dst,  stride_sample_x,  stride_sample_y,  stride_sample_dst,  stream);
-        } break;
-        default: {
-            GGML_ABORT("fatal error");
-        } break;
-    }
-}
-
-void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-    GGML_ASSERT(        src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(!ids ||  ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(         dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const size_t ts_src0 = ggml_type_size(src0->type);
-    const size_t ts_src1 = ggml_type_size(src1->type);
-    const size_t ts_dst  = ggml_type_size(dst->type);
-
-    GGML_ASSERT(ne13 == ne3);
-
-    GGML_ASSERT(        nb00       == ts_src0);
-    GGML_ASSERT(        nb10       == ts_src1);
-    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
-    GGML_ASSERT(        nb0        == ts_dst);
-
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
-
-    const float   * src1_d =       (const float   *) src1->data;
-    const int32_t *  ids_d = ids ? (const int32_t *)  ids->data : nullptr;
-    float         *  dst_d =       (float         *)  dst->data;
-
-    const int64_t s01 = src0->nb[1] / ts_src0;
-    const int64_t s11 = src1->nb[1] / ts_src1;
-    const int64_t s1  =  dst->nb[1] / ts_dst;
-    const int64_t s02 = src0->nb[2] / ts_src0;
-    const int64_t s12 = src1->nb[2] / ts_src1;
-    const int64_t s2  =  dst->nb[2] / ts_dst;
-    const int64_t s03 = src0->nb[3] / ts_src0;
-    const int64_t s13 = src1->nb[3] / ts_src1;
-    const int64_t s3  =  dst->nb[3] / ts_dst;
-
-    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
-    const int64_t ncols_dst          = ids ? ne2  : ne1;
-    const int64_t nchannels_y        = ids ? ne11 : ne12;
-    const int64_t nchannels_dst      = ids ? ne1  : ne2;
-    const int64_t stride_channel_dst = ids ? s1   : s2;
-    const int64_t stride_channel_y   = ids ? s11  : s12;
-
-    GGML_ASSERT(!ids || ncols_dst == 1);
-
-    switch (src0->type) {
-        case GGML_TYPE_F32: {
-            const float * src0_d = (const float *) src0->data;
-            constexpr int vals_per_T = 1;
-            mul_mat_f_switch_cols_per_block(
-                src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
-                ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
-                ne03,              ne3,           s03/vals_per_T, s13,              s3,                 ctx.stream());
-        } break;
-        case GGML_TYPE_F16: {
-            const half2 * src0_d = (const half2 *) src0->data;
-            constexpr int vals_per_T = 2;
-            mul_mat_f_switch_cols_per_block(
-                src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
-                ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
-                ne03,              ne3,           s03/vals_per_T, s13,              s3,                 ctx.stream());
-        } break;
-        case GGML_TYPE_BF16: {
-            const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
-            constexpr int vals_per_T = 2;
-            mul_mat_f_switch_cols_per_block(
-                src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
-                ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
-                ne03,              ne3,           s03/vals_per_T, s13,              s3,                 ctx.stream());
-        } break;
-        default:
-            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
-    }
-}
-
-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, int64_t ne11) {
-    if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
-        return false;
-    }
-    if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
-        return false;
-    }
-    if (ne11 > 16) {
-        return false;
-    }
-    switch (type) {
-        case GGML_TYPE_F32:
-            return ampere_mma_available(cc);
-        case GGML_TYPE_F16:
-            return turing_mma_available(cc);
-        case GGML_TYPE_BF16:
-            return ampere_mma_available(cc);
-        default:
-            return false;
-    }
-}
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
-
-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, int64_t ne11);
@@ -20,9 +20,6 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
        case GGML_TYPE_Q8_0:
            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
-        case GGML_TYPE_MXFP4:
-            mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
-            break;
        case GGML_TYPE_Q2_K:
            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
            break;
@@ -285,7 +282,6 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -310,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return false;
    }

-    if (turing_mma_available(cc)) {
+    if (new_mma_available(cc)) {
        return true;
    }

@@ -1,10 +1,9 @@
 #include "ggml.h"
 #include "common.cuh"
-#include "convert.cuh"
-#include "mmvf.cuh"
+#include "mmv.cuh"

 template <typename T, typename type_acc, int ncols_dst, int block_size>
-static __global__ void mul_mat_vec_f(
+static __global__ void mul_mat_vec(
        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
        const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
        const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
@@ -38,7 +37,7 @@ static __global__ void mul_mat_vec_f(

    float sumf[ncols_dst] = {0.0f};

-    if constexpr (std::is_same_v<T, float>) {
+    if constexpr (std::is_same<T, float>::value) {
        const float2 * x2 = (const float2 *) x;

        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
@@ -51,10 +50,10 @@ static __global__ void mul_mat_vec_f(
                sumf[j] += tmpx.y*tmpy.y;
            }
        }
-    } else if constexpr (std::is_same_v<T, half>) {
+    } else if constexpr (std::is_same<T, half>::value) {
        const half2 * x2 = (const half2 *) x;

-        if (std::is_same_v<type_acc, float>) {
+        if (std::is_same<type_acc, float>::value) {
            for (int col2 = tid; col2 < ncols2; col2 += block_size) {
                const float2 tmpx = __half22float2(x2[col2]);

@@ -87,19 +86,19 @@ static __global__ void mul_mat_vec_f(
            NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
        }
-    } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
        const int * x2 = (const int *) x;
        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
            const int tmpx = x2[col2];
 #pragma unroll
            for (int j = 0; j < ncols_dst; ++j) {
                const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
            }
        }
    } else {
-        static_assert(std::is_same_v<T, void>, "unsupported type");
+        static_assert(std::is_same<T, void>::value, "unsupported type");
    }

 #pragma unroll
@@ -127,7 +126,7 @@ static __global__ void mul_mat_vec_f(
 }

 template <typename T, typename type_acc, int ncols_dst>
-static void launch_mul_mat_vec_f_cuda(
+static void launch_mul_mat_vec_cuda(
        const T * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows,
        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
@@ -142,9 +141,11 @@ static void launch_mul_mat_vec_f_cuda(
    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
    const int64_t channel_ratio = nchannels_dst / nchannels_x;
    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
+    int device;
+    int warp_size;

-    const int device = ggml_cuda_get_device();
-    const int warp_size = ggml_cuda_info().devices[device].warp_size;
+    CUDA_CHECK(cudaGetDevice(&device));
+    warp_size = ggml_cuda_info().devices[device].warp_size;

    int64_t block_size_best = warp_size;
    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
@@ -160,54 +161,54 @@ static void launch_mul_mat_vec_f_cuda(
        }
    }

-    const int nbytes_shared = warp_size*sizeof(float);
+    const int smem = warp_size*sizeof(float);
    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
    const dim3 block_dims(block_size_best, 1, 1);
    switch (block_size_best) {
        case   32: {
-            mul_mat_vec_f<T, type_acc, ncols_dst,  32><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst,  32><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case   64: {
-            mul_mat_vec_f<T, type_acc, ncols_dst,  64><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst,  64><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case   96: {
-            mul_mat_vec_f<T, type_acc, ncols_dst,  96><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst,  96><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case  128: {
-            mul_mat_vec_f<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case  160: {
-            mul_mat_vec_f<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case  192: {
-            mul_mat_vec_f<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case  224: {
-            mul_mat_vec_f<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case  256: {
-            mul_mat_vec_f<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, smem, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
@@ -219,7 +220,7 @@ static void launch_mul_mat_vec_f_cuda(
 }

 template <typename T, typename type_acc>
-static void mul_mat_vec_f_cuda_switch_ncols_dst(
+static void mul_mat_vec_cuda_switch_ncols_dst(
        const T * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
@@ -229,49 +230,49 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
        cudaStream_t stream) {
    switch (ncols_dst) {
        case 1:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 1>
+            launch_mul_mat_vec_cuda<T, type_acc, 1>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 2:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 2>
+            launch_mul_mat_vec_cuda<T, type_acc, 2>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 3:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 3>
+            launch_mul_mat_vec_cuda<T, type_acc, 3>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 4:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 4>
+            launch_mul_mat_vec_cuda<T, type_acc, 4>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 5:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 5>
+            launch_mul_mat_vec_cuda<T, type_acc, 5>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 6:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 6>
+            launch_mul_mat_vec_cuda<T, type_acc, 6>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 7:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 7>
+            launch_mul_mat_vec_cuda<T, type_acc, 7>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case 8:
-            launch_mul_mat_vec_f_cuda<T, type_acc, 8>
+            launch_mul_mat_vec_cuda<T, type_acc, 8>
                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
@@ -283,7 +284,7 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
 }

 template<typename T>
-static void mul_mat_vec_f_cuda(
+static void mul_mat_vec_cuda(
        const T * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
        const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst,
@@ -291,22 +292,22 @@ static void mul_mat_vec_f_cuda(
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        enum ggml_prec prec, cudaStream_t stream) {
-    if constexpr(std::is_same_v<T, half>) {
+    if constexpr(std::is_same<T, half>::value) {
        if (prec == GGML_PREC_DEFAULT) {
-            mul_mat_vec_f_cuda_switch_ncols_dst<T, half>
+            mul_mat_vec_cuda_switch_ncols_dst<T, half>
                (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            return;
        }
    }
-    mul_mat_vec_f_cuda_switch_ncols_dst<T, float>
+    mul_mat_vec_cuda_switch_ncols_dst<T, float>
        (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
         nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
 }

-void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT(        src1->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids ||  ids->type == GGML_TYPE_I32);
    GGML_ASSERT(         dst->type == GGML_TYPE_F32);
@@ -354,19 +355,19 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
    switch (src0->type) {
        case GGML_TYPE_F32: {
            const float * src0_d = (const float *) src0->data;
-            mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
+            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
                ne03,              ne3,           s03, s13,              s3,                 prec, ctx.stream());
        } break;
        case GGML_TYPE_F16: {
            const half * src0_d = (const half *) src0->data;
-            mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
+            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
                ne03,              ne3,           s03, s13,              s3,                 prec, ctx.stream());
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
-            mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
+            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
                ne03,              ne3,           s03, s13,              s3,                 prec, ctx.stream());
        } break;
@@ -375,7 +376,7 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
    }
 }

-void ggml_cuda_op_mul_mat_vec_f(
+void ggml_cuda_op_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -413,19 +414,19 @@ void ggml_cuda_op_mul_mat_vec_f(
    switch (src0->type) {
        case GGML_TYPE_F32: {
            const float * src0_d = (const float *) src0_dd_i;
-            mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
        } break;
        case GGML_TYPE_F16: {
            const half * src0_d = (const half *) src0_dd_i;
-            mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
-            mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
        } break;
@@ -441,15 +442,15 @@ void ggml_cuda_op_mul_mat_vec_f(
    GGML_UNUSED(src1_padded_row_size);
 }

-bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
+bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
    if (src0_ne[0] % 2 != 0) {
        return false;
    }
    switch (type) {
        case GGML_TYPE_F32:
            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
-                if (ampere_mma_available(cc)) {
-                    return ne11 <= 3;
+                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
+                    return ne11 <= 8;
                }
                if (cc >= GGML_CUDA_CC_TURING) {
                    return ne11 <= 4;
@@ -465,9 +466,6 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
        case GGML_TYPE_F16:
            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
                const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
-                if (ampere_mma_available(cc)) {
-                    return src0_small && ne11 == 1;
-                }
                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
                    return src0_small && ne11 <= 4;
                }
@@ -488,9 +486,6 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
        case GGML_TYPE_BF16:
            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
                const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
-                if (ampere_mma_available(cc)) {
-                    return src0_small && ne11 == 1;
-                }
                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
                    return src0_small && ne11 <= 4;
                }
@@ -1,11 +1,11 @@
 #include "common.cuh"

-void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

-void ggml_cuda_op_mul_mat_vec_f(
+void ggml_cuda_op_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);

-bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
+bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
@@ -13,7 +13,6 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
        case GGML_TYPE_Q5_0:    return vec_dot_q5_0_q8_1;
        case GGML_TYPE_Q5_1:    return vec_dot_q5_1_q8_1;
        case GGML_TYPE_Q8_0:    return vec_dot_q8_0_q8_1;
-        case GGML_TYPE_MXFP4:   return vec_dot_mxfp4_q8_1;
        case GGML_TYPE_Q2_K:    return vec_dot_q2_K_q8_1;
        case GGML_TYPE_Q3_K:    return vec_dot_q3_K_q8_1;
        case GGML_TYPE_Q4_K:    return vec_dot_q4_K_q8_1;
@@ -39,7 +38,6 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
        case GGML_TYPE_Q5_0:    return VDR_Q5_0_Q8_1_MMVQ;
        case GGML_TYPE_Q5_1:    return VDR_Q5_1_Q8_1_MMVQ;
        case GGML_TYPE_Q8_0:    return VDR_Q8_0_Q8_1_MMVQ;
-        case GGML_TYPE_MXFP4:   return VDR_MXFP4_Q8_1_MMVQ;
        case GGML_TYPE_Q2_K:    return VDR_Q2_K_Q8_1_MMVQ;
        case GGML_TYPE_Q3_K:    return VDR_Q3_K_Q8_1_MMVQ;
        case GGML_TYPE_Q4_K:    return VDR_Q4_K_Q8_1_MMVQ;
@@ -386,13 +384,6 @@ static void mul_mat_vec_q_switch_type(
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                 stream);
            break;
-        case GGML_TYPE_MXFP4:
-            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
-                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
-                 stream);
-            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
@@ -1,49 +0,0 @@
-#include "ggml-impl.h"
-#include "opt-step-sgd.cuh"
-
-#include <cstdint>
-
-static __global__ void opt_step_sgd_f32(
-    float * __restrict__ x, const float * __restrict__ g,
-    const float * __restrict__ pars, const int64_t k) {
-
-    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    x[i] = x[i] * (1.0f - pars[0] * pars[1]) - pars[0] * g[i];
-}
-
-static void opt_step_sgd_f32_cuda(
-    float * x, const float * g, const float * __restrict__ pars, const int64_t k, cudaStream_t stream) {
-
-    const dim3 block_dims(CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
-    const dim3 block_nums((k + CUDA_OPT_STEP_SGD_BLOCK_SIZE - 1) / CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
-    opt_step_sgd_f32<<<block_nums, block_dims, 0, stream>>>(x, g, pars, k);
-}
-
-void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0      = dst->src[0];
-    const ggml_tensor * src0_grad = dst->src[1];
-    const ggml_tensor * params    = dst->src[2];
-
-    GGML_ASSERT(src0->type      == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
-    GGML_ASSERT(params->type    == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src0_grad));
-    GGML_ASSERT(ggml_is_contiguous(params));
-    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
-    GGML_ASSERT(ggml_nelements(params) == 2);
-
-    float       * src0_d      = (float       *) src0->data;
-    const float * src0_grad_d = (const float *) src0_grad->data;
-    const float * params_d    = (const float *) params->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    const int64_t ne = ggml_nelements(src0);
-
-    opt_step_sgd_f32_cuda(src0_d, src0_grad_d, params_d, ne, stream);
-}
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_OPT_STEP_SGD_BLOCK_SIZE 256
-
-void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -1,53 +0,0 @@
-#include "common.cuh"
-
-// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template <bool norm>
-static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float     sum        = 0.0f;
-    const int num_unroll = 8;
-    float     temp[num_unroll];
-    float     sum_temp[num_unroll] = { 0.0f };
-    for (int i = col; i < ncols;) {
-        for (int j = 0; j < num_unroll; ++j) {
-            if (i < ncols) {
-                temp[j] = x[row * ncols + i];
-            } else {
-                temp[j] = 0;
-            }
-            i += blockDim.x;
-        }
-        for (int j = 0; j < num_unroll; ++j) {
-            sum_temp[j] += temp[j];
-        }
-    }
-    for (int j = 0; j < num_unroll; ++j) {
-        sum += sum_temp[j];
-    }
-
-    // sum up partial sums
-    sum = warp_reduce_sum(sum);
-    if (blockDim.x > WARP_SIZE) {
-        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
-        __shared__ float s_sum[32];
-        const int        warp_id = threadIdx.x / WARP_SIZE;
-        const int        lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = sum;
-        }
-        __syncthreads();
-        sum = 0.0f;
-        if (lane_id < (blockDim.x / WARP_SIZE)) {
-            sum = s_sum[lane_id];
-        }
-        sum = warp_reduce_sum(sum);
-    }
-
-    if (col != 0) {
-        return;
-    }
-
-    dst[row] = norm ? sum / ncols : sum;
-}
@@ -3,6 +3,11 @@

 typedef void (*set_rows_kernel_t)(const char * src, char * dst);

+template<typename src_t, typename dst_t>
+__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
+    convert_flt(src_f, dst_f);
+}
+
 // Generic quantized set_rows kernel template
 template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
 static __global__ void k_set_rows_quant(
@@ -112,7 +117,9 @@ static __global__ void k_set_rows(
    const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
    dst_t * dst_row_ptr    = dst + dst_row*s1 + i02*s2 + i03*s3;

-    dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
+    const src_t* src_elem = src0_row + i00;
+    dst_t* dst_elem = dst_row_ptr + i00;
+    set_rows_1(src_elem, dst_elem);

    GGML_UNUSED(ne10);
    GGML_UNUSED(ne13);
@@ -45,7 +45,7 @@ struct soft_max_params {
 #endif // __clang__
 template <bool use_shared, int ncols_template, int block_size_template, typename T>
 static __global__ void soft_max_f32(
-        const float * x, const T * mask, const float * sinks, float * dst, const soft_max_params p) {
+        const float * x, const T * mask, float * dst, const soft_max_params p) {
    const int ncols = ncols_template == 0 ? p.ncols : ncols_template;

    const int tid  = threadIdx.x;
@@ -77,7 +77,7 @@ static __global__ void soft_max_f32(
    // shared memory buffer to cache values between iterations:
    float * vals = use_shared ? buf_iw + WARP_SIZE : dst;

-    float max_val = sinks ? sinks[i02] : -INFINITY;
+    float max_val = -INFINITY;

 #pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
@@ -143,10 +143,6 @@ static __global__ void soft_max_f32(
        tmp = warp_reduce_sum(tmp);
    }

-    if (sinks) {
-        tmp += expf(sinks[i02] - max_val);
-    }
-
    const float inv_sum = 1.0f / tmp;

 #pragma unroll
@@ -187,7 +183,7 @@ static __global__ void soft_max_back_f32(
 }

 template<int... Ns, typename T>
-static void launch_soft_max_kernels(const float * x, const T * mask, const float * sinks, float * dst,
+static void launch_soft_max_kernels(const float * x, const T * mask, float * dst,
                             const soft_max_params & p, cudaStream_t stream, dim3 block_dims, dim3 block_nums, size_t nbytes_shared)
 {
    const int id       = ggml_cuda_get_device();
@@ -200,7 +196,7 @@ static void launch_soft_max_kernels(const float * x, const T * mask, const float
        if (p.ncols == ncols) {
            CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, ncols, block, T>), smpbo);
            soft_max_f32<true, ncols, block><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, mask, sinks, dst, p);
+                (x, mask, dst, p);
            return true;
        }
        return false;
@@ -213,12 +209,12 @@ static void launch_soft_max_kernels(const float * x, const T * mask, const float

    //default case
    CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, 0, 0, T>), smpbo);
-    soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, sinks, dst, p);
+    soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, dst, p);
 }


 template<typename T>
-static void soft_max_f32_cuda(const float * x, const T * mask, const float * sinks, float * dst, const soft_max_params & params, cudaStream_t stream) {
+static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const soft_max_params & params, cudaStream_t stream) {
    int nth = WARP_SIZE;
    const int64_t ncols_x = params.ncols;

@@ -234,10 +230,10 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const float * sin


    if (nbytes_shared <= smpbo) {
-        launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, sinks, dst, params, stream, block_dims, block_nums, nbytes_shared);
+        launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, dst, params, stream, block_dims, block_nums, nbytes_shared);
    } else {
        const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
-        soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, sinks, dst, params);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, params);
    }
 }

@@ -253,11 +249,9 @@ static void soft_max_back_f32_cuda(
 void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
-    const ggml_tensor * src2 = dst->src[2];

    const float * src0_d = (const float *) src0->data;
    const void  * src1_d = src1 ? (const void *) src1->data : nullptr;
-    const void  * src2_d = src2 ? (const void *) src2->data : nullptr;
    float       *  dst_d = (float *) dst->data;

    cudaStream_t stream = ctx.stream();
@@ -315,9 +309,9 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    params.m1 = m1;

    if (use_f16) {
-        soft_max_f32_cuda(src0_d, (const half  *) src1_d, (const float *) src2_d, dst_d, params, stream);
+        soft_max_f32_cuda(src0_d, (const half  *) src1_d, dst_d, params, stream);
    } else {
-        soft_max_f32_cuda(src0_d, (const float *) src1_d, (const float *) src2_d, dst_d, params, stream);
+        soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, params, stream);
    }
 }

@@ -1,117 +1,87 @@
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
-#define USE_CUB
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
-
-#ifdef USE_CUB
-#include <cub/cub.cuh>
-using namespace cub;
-#endif // USE_CUB
-
 #include "ssm-scan.cuh"

-// We would like to keep pragma unroll for cases where L_template is not 0,
-// so we suppress the clang transformation warning.
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wpass-failed"
-#endif // __clang__
-template <size_t splitD, size_t N, size_t L_template>
-__global__ void __launch_bounds__(splitD, 1)
-    ssm_scan_f32(const float *__restrict__ src0, const float *__restrict__ src1, const float *__restrict__ src2,
-                 const float *__restrict__ src3, const float *__restrict__ src4, const float *__restrict__ src5,
+template <size_t splitD, size_t N>
+__global__ void __launch_bounds__(splitD, 2)
+    ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
+                 const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
                 const int32_t * __restrict__ src6, float * __restrict__ dst,
                 const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3,
                 const int src2_nb1, const int src2_nb2, const int src3_nb1,
                 const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
-                 const int64_t s_off, const int64_t d_inner, const int64_t L_param)
-{
-    const size_t L = L_template == 0 ? L_param : L_template;
-    const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2);
-    const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float));
-    const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float));
-    const float *A_block = (const float *)((const char *)src3 + blockIdx.y * splitD * src3_nb1);
-    const float *B_block = (const float *)((const char *)src4 + (blockIdx.x * src4_nb3));
-    const float *C_block = (const float *)((const char *)src5 + (blockIdx.x * src5_nb3));
-    float *y_block = (float *)((char *)dst + (blockIdx.x * d_inner * L * sizeof(float)) + blockIdx.y * splitD * sizeof(float));
-    float *s_block = (float *)((char *)dst + s_off + blockIdx.x * src0_nb3 + blockIdx.y * splitD * src0_nb2);
+                 const int64_t s_off, const int64_t d_inner, const int64_t L) {

-    const int stride_x = src1_nb2 / sizeof(float);
-    const int stride_dt = src2_nb1 / sizeof(float);
-    const int stride_B = src4_nb2 / sizeof(float);
-    const int stride_C = src5_nb2 / sizeof(float);
-    const int stride_y = d_inner;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+    const int bidx = blockIdx.x;  // split along B (sequences)
+    const int bidy = blockIdx.y;  // split along D (d_inner)
+    const int tid  = threadIdx.x;
+    const int wid  = tid / 32;
+    const int wtid = tid % 32;

-    float regA[N];
-    float regs0[N];
+    extern __shared__ float smem[];
+    const int               stride_sA  = N + 1;
+    const int               stride_ss0 = N + 1;
+    float *                 smem_A     = smem;
+    float *                 smem_s0    = smem_A + splitD * stride_sA;

-    __shared__ float smemB[N];
-    __shared__ float smemC[N];
+    const float * s0_block = (const float *) ((const char *) src0 + src6[bidx] * src0_nb3 + bidy * splitD * src0_nb2);
+    const float * x_block  = (const float *) ((const char *) src1 + (bidx * src1_nb3) + bidy * splitD * sizeof(float));
+    const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
+    const float * A_block  = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1);
+    const float * B_block  = (const float *) ((const char *) src4 + (bidx * src4_nb3));
+    const float * C_block  = (const float *) ((const char *) src5 + (bidx * src5_nb3));
+    float *       y_block  = (float *) ((char *) dst + (bidx * d_inner * L * sizeof(float)) + bidy * splitD * sizeof(float));
+    float *       s_block  = (float *) ((char *) dst + s_off + bidx * src0_nb3 + bidy * splitD * src0_nb2);

-#ifdef USE_CUB
-    using BlockLoad = cub::BlockLoad<float, splitD, N, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
-    using BlockStore = cub::BlockStore<float, splitD, N, cub::BLOCK_STORE_WARP_TRANSPOSE>;
-
-    union CubTempStorage {
-        typename BlockLoad::TempStorage load_temp;
-        typename BlockStore::TempStorage store_temp;
-    };
-    __shared__ CubTempStorage cub_temp_storage;
-
-    BlockLoad(cub_temp_storage.load_temp).Load(A_block, regA);
-    BlockLoad(cub_temp_storage.load_temp).Load(s0_block, regs0);
-#else
    const int stride_s0 = src0_nb2 / sizeof(float);
-    const int stride_A = src3_nb1 / sizeof(float);
+    const int stride_x  = src1_nb2 / sizeof(float);
+    const int stride_dt = src2_nb1 / sizeof(float);
+    const int stride_A  = src3_nb1 / sizeof(float);
+    const int stride_B  = src4_nb2 / sizeof(float);
+    const int stride_C  = src5_nb2 / sizeof(float);
+    const int stride_s  = stride_s0;
+    const int stride_y  = d_inner;
+
+    // TODO: can N be anything other than 16 (for example 32)? Only N == 16 is handled below.
+    if (N == 16) {
 #pragma unroll
-    for (size_t n = 0; n < N; ++n)
-    {
-        regA[n] = A_block[threadIdx.x * stride_A + n];
-        regs0[n] = s0_block[threadIdx.x * stride_s0 + n];
+        for (size_t i = 0; i < splitD / 4; i += 2) {
+            float value = A_block[(wid * warp_size + i) * stride_A + wtid];
+            // TODO: bank conflict
+            // It is not yet clear how to apply the swizzling method here to resolve the
+            // bank conflict. Suggestions are welcome.
+            smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+        }
+#pragma unroll
+        for (size_t i = 0; i < splitD / 4; i += 2) {
+            float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
+            smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+        }
    }
-#endif

-#pragma unroll
-    for (size_t i = 0; i < L; i++)
-    {
-        if (threadIdx.x < N)
-        {
-            smemB[threadIdx.x] = B_block[i * stride_B + threadIdx.x];
-            smemC[threadIdx.x] = C_block[i * stride_C + threadIdx.x];
+    __syncthreads();
+
+    for (int64_t i = 0; i < L; i++) {
+        float dt_soft_plus = dt_block[i * stride_dt + tid];
+        if (dt_soft_plus <= 20.0f) {
+            dt_soft_plus = log1pf(exp(dt_soft_plus));
        }
-        __syncthreads();
-
-        float dt_soft_plus = dt_block[i * stride_dt + threadIdx.x];
-        if (dt_soft_plus <= 20.0f)
-        {
-            dt_soft_plus = log1pf(expf(dt_soft_plus));
-        }
-        float x_dt = x_block[i * stride_x + threadIdx.x] * dt_soft_plus;
-
+        float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
        float sumf = 0.0f;
 #pragma unroll
-        for (size_t n = 0; n < N; n++)
-        {
-            float state = regs0[n] * expf(dt_soft_plus * regA[n]) + smemB[n] * x_dt;
-            sumf += state * smemC[n];
-            regs0[n] = state;
+        for (size_t j = 0; j < N; j++) {
+            float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
+                          (B_block[i * stride_B + j] * x_dt);
+            sumf += state * C_block[i * stride_C + j];
+            if (i == L - 1) {
+                s_block[tid * stride_s + j] = state;
+            } else {
+                smem_s0[tid * stride_ss0 + j] = state;
+            }
        }
-        y_block[i * stride_y + threadIdx.x] = sumf;
+        __syncthreads();
+        y_block[i * stride_y + tid] = sumf;
    }
-
-#ifdef USE_CUB
-    BlockStore(cub_temp_storage.store_temp).Store(s_block, regs0);
-#else
-    const int stride_s = stride_s0;
-#pragma unroll
-    for (size_t n = 0; n < N; ++n)
-    {
-        s_block[threadIdx.x * stride_s + n] = regs0[n];
-    }
-#endif
 }
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif // __clang__

 // assumes as many threads as d_state
 template <int splitH, int d_state>
@@ -231,11 +201,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
                              const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
                              const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
                              cudaStream_t stream) {
-    const int threads = 128;
    // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
    if (src3_nb1 == sizeof(float)) {
        // Mamba-2
        if (d_state == 128) {
+            const int threads = 128;
            GGML_ASSERT(d_state % threads == 0);
            // NOTE: can be any power of two between 4 and 64
            const int splitH = 16;
@@ -259,6 +229,7 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
            GGML_ABORT("doesn't support d_state!=(128 or 256).");
        }
    } else {
+        const int threads = 128;
        // Mamba-1
        GGML_ASSERT(n_head % threads == 0);
        GGML_ASSERT(head_dim == 1);
@@ -266,63 +237,10 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
        const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
        const int  smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
        if (d_state == 16) {
-            switch (n_tok)
-            {
-            case 1:
-                ssm_scan_f32<threads, 16, 1><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
+            ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
+                src0, src1, src2, src3, src4, src5, src6, dst,
                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 2:
-                ssm_scan_f32<threads, 16, 2><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 3:
-                ssm_scan_f32<threads, 16, 3><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 4:
-                ssm_scan_f32<threads, 16, 4><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 5:
-                ssm_scan_f32<threads, 16, 5><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 6:
-                ssm_scan_f32<threads, 16, 6><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 7:
-                ssm_scan_f32<threads, 16, 7><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            case 8:
-                ssm_scan_f32<threads, 16, 8><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            default:
-                ssm_scan_f32<threads, 16, 0><<<blocks, threads, smem_size, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
-                src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
-                break;
-            }
        } else {
            GGML_ABORT("doesn't support d_state!=16.");
        }
@@ -1,15 +1,19 @@
-#include "sum.cuh"
-#include "sumrows.cuh"
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+#define USE_CUB
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070

-#ifdef GGML_CUDA_USE_CUB
+#ifdef USE_CUB
 #include <cub/cub.cuh>
 using namespace cub;
-#endif  // GGML_CUDA_USE_CUB
+#endif // USE_CUB
+
+#include "sumrows.cuh"
+#include "sum.cuh"

 #include <cstdint>

 void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
-#ifdef GGML_CUDA_USE_CUB
+#ifdef USE_CUB
    size_t tmp_size = 0;
    DeviceReduce::Sum(nullptr,       tmp_size, x, dst, ne, stream);
    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
@@ -19,7 +23,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
    sum_rows_f32_cuda(x, dst, ne, 1, stream);
    GGML_UNUSED(pool);
-#endif // GGML_CUDA_USE_CUB
+#endif // USE_CUB
 }

 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -1,17 +1,9 @@
-#include "reduce_rows.cuh"
 #include "sumrows.cuh"

 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const int  id  = ggml_cuda_get_device();
-    const int  nsm = ggml_cuda_info().devices[id].nsm;
+    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
-    if ((nrows / nsm) < 2) {
-        const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-    } else {
-        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-    }
+    reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }

 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -27,17 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

+    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);

-    const int id  = ggml_cuda_get_device();
-    const int nsm = ggml_cuda_info().devices[id].nsm;
-    if ((nrows / nsm) < 2) {
-        // Increase num threads to 512 for small nrows to better hide the latency
-        const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
-    } else {
-        // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
-        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
-    }
+    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
 }
@@ -1,5 +0,0 @@
-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../mmq.cuh"
-
-DECL_MMQ_CASE(GGML_TYPE_MXFP4);
@@ -300,81 +300,6 @@ void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    ggml_cuda_op_unary_gated<op_gelu_quick>(ctx, dst);
 }

-// swiglu_oai
-
-template <typename T>
-static __global__ void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, float alpha, float limit) {
-    const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    // perform base op and multiply with gate (either offset in same tensor or a separate one)
-    const int64_t j0 = (i / n) * o0 + (i % n);
-    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-
-    float xi = x[j0];
-    float gi = g[j1];
-    xi = fminf(xi, limit);
-    gi = fmaxf(fminf(gi, limit), -limit);
-
-    float out_glu = xi / (1.0f + expf(-xi * alpha));
-    out_glu = out_glu * (1.0f + gi);
-
-    dst[i] = out_glu;
-}
-
-template <typename T>
-static void swiglu_oai_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, const float alpha, const float limit, cudaStream_t stream) {
-    const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
-    swiglu_oai_kernel<<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o0, o1, alpha, limit);
-}
-
-void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    void * src0_d = src0->data;
-    void * src1_d = src1 ? src1->data : src0->data;
-    const int64_t src0_o = src0->nb[1];
-    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
-    void * dst_d = dst->data;
-    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->type == dst->type);
-    GGML_ASSERT(dst->ne[0] == nc);
-    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));
-
-    if (src1) {
-        GGML_ASSERT(ggml_is_contiguous_1(src1));
-        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
-        GGML_ASSERT(src1->ne[0] == nc);
-        GGML_ASSERT(src0->type == src1->type);
-    }
-
-    //const int32_t swapped = ((const int32_t *) dst->op_params)[1];
-    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
-    const float alpha = ggml_get_op_params_f32(dst, 2);
-    const float limit = ggml_get_op_params_f32(dst, 3);
-
-    float * src0_p = (float *) src0_d;
-    float * src1_p = (float *) src1_d;
-
-    if (!src1) {
-        src0_p += swapped ? nc : 0;
-        src1_p += swapped ? 0 : nc;
-    }
-
-    swiglu_oai_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
-}
-
 /* silu_back */

 static __device__ __forceinline__ float op_silu_back(float grad, float x) {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Francis Couture-Harpin	145401c9e3	context : fix logits size overflow for huge batches	2025-08-04 22:26:46 -04:00
Francis Couture-Harpin	f16a843a38	context : fix overflow when re-ordering huge outputs	2025-08-04 22:01:28 -04:00