mtmd-helper : bug fix to token batching in mtmd (#13650 )

* Update mtmd-helper.cpp * Update tools/mtmd/mtmd-helper.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
model : fix llama4 graph (#13663 )
2026-07-01 01:57:43 +02:00 · 2025-05-20 18:55:30 +02:00 · 2025-05-20 19:21:04 +03:00 · 2025-05-20 16:13:16 +03:00 · 2025-05-20 14:45:07 +02:00 · 2025-05-20 12:03:17 +02:00
75 changed files with 2996 additions and 1842 deletions
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04

 ## Build Image

@@ -5,6 +5,10 @@ inputs:
    description: 'CURL version'
    required: false
    default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
 outputs:
  curl_path:
    description: "Path to the downloaded libcurl"
@@ -18,8 +22,9 @@ runs:
      shell: powershell
      env:
        CURL_VERSION: ${{ inputs.curl_version }}
+        ARCHITECTURE: ${{ inputs.architecture }}
      run: |
-        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
        mkdir $env:RUNNER_TEMP/libcurl
        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
@@ -899,7 +899,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
@@ -238,14 +238,19 @@ jobs:
      matrix:
        include:
          - build: 'cpu-x64'
+            arch: 'x64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
          #- build: 'openblas-x64'
+          #  arch: 'x64'
          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'vulkan-x64'
+            arch: 'x64'
            defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
          - build: 'cpu-arm64'
+            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
          - build: 'opencl-adreno-arm64'
+            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

    steps:
@@ -312,6 +317,8 @@ jobs:
      - name: libCURL
        id: get_libcurl
        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}

      - name: Build
        id: cmake_build
@@ -339,7 +346,7 @@ jobs:
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

      - name: Upload artifacts
@@ -441,7 +448,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
@@ -572,4 +572,11 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```

-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT License
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
@@ -121,8 +121,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_keep = value;
        }
    ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2057,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.grp_attn_w = value;
        }
    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
    add_opt(common_arg(
        {"-nkvo", "--no-kv-offload"},
        "disable KV offload",
@@ -2585,7 +2586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.n_junk = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"--pos"}, "N",
        string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2648,7 +2649,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.is_pp_shared = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"-npp"}, "n0,n1,...",
        "number of prompt tokens",
@@ -2880,6 +2881,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.chat_template = read_file(value);
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
    add_opt(common_arg(
        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -1102,6 +1102,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

+    mparams.progress_callback           = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
    return mparams;
 }

@@ -1133,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
    cparams.op_offload        = !params.no_op_offload;
+    cparams.swa_full          = params.swa_full;

    if (params.reranking) {
        cparams.embeddings    = true;
@@ -1325,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
    return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
@@ -323,13 +323,13 @@ struct common_params {
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on inifinite text generation
+    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
@@ -368,6 +368,7 @@ struct common_params {
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

@@ -427,6 +428,11 @@ struct common_params {

    // common params
    std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void *                  load_progress_callback_user_data = NULL;
 };

 // call once at the start of a program if it uses libcommon
@@ -615,16 +621,6 @@ std::string common_detokenize(
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@@ -308,6 +308,7 @@ class ModelBase:
                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                            gguf.MODEL_TENSOR.POSNET_NORM1,
                            gguf.MODEL_TENSOR.POSNET_NORM2,
+                            gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                        )
                    )
                    or not new_name.endswith(".weight")
@@ -2092,6 +2093,26 @@ class Llama4Model(LlamaModel):
        return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4VisionModel(VisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
+        assert self.hparams["hidden_act"] == "gelu"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if "multi_modal_projector" in name or "vision_model" in name:
+            # process vision tensors
+            if "positional_embedding_vlm" in name and ".weight" not in name:
+                name += ".weight"
+            return [(self.map_tensor_name(name), data_torch)]
+        return []
+
+
@ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
@@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

 ## Model Supports

-| Model Name                  | FP16  | Q8_0 | Q4_0 |
+| Model Name                  | FP16  | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
-| AquilaChat2-7B              |   √   |   √  |   √  |
-| Baichuan-7b                 |   √   |   √  |   √  |
-| Baichuan2-7B-Chat           |   √   |   √  |   √  |
-| bitnet_b1_58-large          |   √   |   √  |   √  |
-| bloom-560m                  |   √   |   x  |   √  |
-| bloomz-alpaca-560m          |   √   |   x  |   √  |
-| c4ai-command-r-35B-v01      |   x   |   x  |   x  |
-| chatglm3-6B                 |   x   |   x  |   x  |
-| chinese-alpaca-2-1.3b       |   √   |   √  |   √  |
-| CodeShell-7B                |   √   |   √  |   √  |
-| deepseek-ai_deepseek-coder-1.3B-base | x |   x  |   x  |
-| deepseek-ai_DeepSeek-V2-Lite | x   |   x  |   x   |
-| deepseek-coder-6.7B-instruct | x   |   x  |   x   |
-| DeepSeek-V2-Lite-64x1.5B    |   x   |   x  |   x  |
-| falcon-7b-instruct          |   √   |   √  |   √  |
-| flan-t5-large               |   √   |   √  |   √  |
-| gemma-2-9b-it               |   √   |   √  |   √  |
-| glm-4-9B                    |   x   |   x  |   x  |
-| gpt2                        |   √   |   √  |   √  |
-| Gpt2-163M                   |   √   |   √  |   √  |
-| granite-3B-code-instruct    |   √   |   √  |   √  |
+| Llama-2                     |   √   |   √  |   √  |
+| Llama-3                     |   √   |   √  |   √  |
+| Mistral-7B                  |   √   |   √  |   √  |
+| Mistral MOE                 |   √   |   √  |   √  |
+| DBRX                        |   -   |   -  |   -  |
+| Falcon                      |   √   |   √  |   √  |
+| Chinese LLaMA/Alpaca        |   √   |   √  |   √  |
+| Vigogne(French)             |   √   |   √  |   √  |
+| BERT                        |   x   |   x  |   x  |
+| Koala                       |   √   |   √  |   √  |
+| Baichuan                    |   √   |   √  |   √  |
+| Aquila 1 & 2                |   √   |   √  |   √  |
+| Starcoder models            |   √   |   √  |   √  |
+| Refact                      |   √   |   √  |   √  |
+| MPT                         |   √   |   √  |   √  |
+| Bloom                       |   √   |   √  |   √  |
+| Yi models                   |   √   |   √  |   √  |
+| stablelm models             |   √   |   √  |   √  |
+| DeepSeek models             |   x   |   x  |   x  |
+| Qwen models                 |   √   |   √  |   √  |
+| PLaMo-13B                   |   √   |   √  |   √  |
+| Phi models                  |   √   |   √  |   √  |
+| PhiMoE                      |   √   |   √  |   √  |
+| GPT-2                       |   √   |   √  |   √  |
+| Orion                       |   √   |   √  |   √  |
+| InternLM2                   |   √   |   √  |   √  |
+| CodeShell                   |   √   |   √  |   √  |
+| Gemma                       |   √   |   √  |   √  |
+| Mamba                       |   √   |   √  |   √  |
+| Xverse                      |   √   |   √  |   √  |
+| command-r models            |   √   |   √  |   √  |
+| Grok-1                      |   -   |   -  |   -  |
+| SEA-LION                    |   √   |   √  |   √  |
 | GritLM-7B                   |   √   |   √  |   √  |
-| internlm2_5-7b-chat         |   √   |   √  |   √  |
-| koala-7B-HF                 |   √   |   √  |   √  |
-| Llama-2-7b-chat-hf          |   √   |   √  |   √  |
-| Llama-3-Smaug-8B            |   √   |   √  |   √  |
-| Llama2-Chinese-7b-Chat      |   √   |   √  |   √  |
-| Llama3-8B                   |   √   |   √  |   √  |
-| Llama3-8b-chinese           |   √   |   √  |   √  |
-| mamba-130m-hf               |   √   |   √  |   √  |
-| Mistral-7B-Instruct-v0.2    |   √   |   √  |   √  |
-| Mixtral-8x7B-Instruct-v0.1  |   x   |   √  |   √  |
-| mpt-7B                      |   √   |   √  |   √  |
-| OLMo-1B-hf                  |   √   |   √  |   √  |
-| OpenELM-3B-Instruct         |   √   |   √  |   √  |
-| Orion-14b-base              |   √   |   √  |   √  |
-| phi1                        |   x   |   x  |   x  |
-| phi2                        |   x   |   x  |   x  |
-| Phi-3-mini-4k-instruct      |   √   |   √  |   √  |
-| plamo-13b                   |   √   |   √  |   √  |
-| pythia-70M                  |   x   |   x  |   x  |
-| Qwen-7B                     |   √   |   √  |   √  |
-| Qwen2-1.5B-Instruct         |   √   |   x  |   √  |
-| Refact-1_6B-fim             |   √   |   √  |   √  |
-| SmolLM-135M                 |   √   |   √  |   √  |
-| stablelm-zephyr             |   x   |   x  |   x  |
-| stablelm-2-zephyr-1_6b      |   x   |   x  |   x  |
-| starcoderbase-1b            |   √   |   √  |   √  |
-| starcoder2-3b               |   √   |   √  |   √  |
-| vigogne-7b-chat             |   √   |   √  |   √  |
-| xverse-7b-chat              |   √   |   √  |   √  |
-| Yi-6b-Chat                  |   √   |   √  |   √  |
+| OLMo                        |   √   |   √  |   √  |
+| OLMo 2                      |   √   |   √  |   √  |
+| OLMoE                       |   √   |   √  |   √  |
+| Granite models              |   √   |   √  |   √  |
+| GPT-NeoX                    |   √   |   √  |   √  |
+| Pythia                      |   √   |   √  |   √  |
+| Snowflake-Arctic MoE        |   -   |   -  |   -  |
+| Smaug                       |   √   |   √  |   √  |
+| Poro 34B                    |   √   |   √  |   √  |
+| Bitnet b1.58 models         |   √   |   x  |   x  |
+| Flan-T5                     |   √   |   √  |   √  |
+| Open Elm models             |   x   |   √  |   √  |
+| ChatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b     |   √   |   √  |   √  |
+| GLM-4-0414                  |   √   |   √  |   √  |
+| SmolLM                      |   √   |   √  |   √  |
+| EXAONE-3.0-7.8B-Instruct    |   √   |   √  |   √  |
+| FalconMamba Models          |   √   |   √  |   √  |
+| Jais Models                 |   -   |   x  |   x  |
+| Bielik-11B-v2.3             |   √   |   √  |   √  |
+| RWKV-6                      |   -   |   √  |   √  |
+| QRWKV-6                     |   √   |   √  |   √  |
+| GigaChat-20B-A3B            |   x   |   x  |   x  |
+| Trillion-7B-preview         |   √   |   √  |   √  |
+| Ling models                 |   √   |   √  |   √  |
+
+
+**Multimodal**
+| Model Name                  | FP16  | Q4_0 | Q8_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| LLaVA 1.5 models, LLaVA 1.6 models      |   x   |   x  |   x  |
+|  BakLLaVA                   |   √   |   √  |   √  |
+|  Obsidian                   |   √   |   -  |   -  |
+|  ShareGPT4V                 |   x   |   -  |   -  |
+|  MobileVLM 1.7B/3B models   |   -   |   -  |   -  |
+|  Yi-VL                      |   -   |   -  |   -  |
+|  Mini CPM                   |   √   |   √  |   √  |
+|  Moondream                  |   √   |   √  |   √  |
+|  Bunny                      |   √   |   -  |   -  |
+|  GLM-EDGE                   |   √   |   √  |   √  |
+|  Qwen2-VL                   |   √   |   √  |   √  |



@@ -17,25 +17,25 @@

 **SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.

-**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
+**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
 - **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
+- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

 ### Llama.cpp + SYCL

-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
+The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
+SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.

 ## Recommended Release

-The SYCL backend would be broken by some PRs due to no online CI.
-
-The following release is verified with good quality:
+The following releases are verified and recommended:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
 |-|-|-|-|-|
+|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
 |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
 |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||

@@ -106,15 +106,14 @@ SYCL backend supports Intel GPU Family:
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake    |
-| Intel iGPU                    | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750, B580         |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
+| Intel iGPU                    | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7  |

 *Notes:*

 - **Memory**
  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
-
  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.

 - **Execution Unit (EU)**
@@ -138,9 +137,11 @@ Note: AMD GPU support is highly experimental and is incompatible with F16.
 Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.

 ## Docker
-The docker build option is currently limited to *intel GPU* targets.
+
+The docker build option is currently limited to *Intel GPU* targets.

 ### Build image
+
 ```sh
 # Using FP16
 docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
@@ -148,9 +149,10 @@ docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f

 *Notes*:

-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.

 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
+Check the [documentation for Docker](../docker.md) to see the available images.

 ### Run container

@@ -250,7 +252,7 @@ sycl-ls

 - **Intel GPU**

-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
+When targeting an Intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:

 ```
 [opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
@@ -282,7 +284,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:

 #### Intel GPU

-```
+```sh
 ./examples/sycl/build.sh
 ```

@@ -351,7 +353,7 @@ cmake --build build --config Release -j -v

 #### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).

 ##### Check device

@@ -398,11 +400,15 @@ Choose one of following methods to run.

 ```sh
 ./examples/sycl/run-llama2.sh 0
+# OR
+./examples/sycl/run-llama3.sh 0
 ```
 - Use multiple devices:

 ```sh
 ./examples/sycl/run-llama2.sh
+# OR
+./examples/sycl/run-llama3.sh
 ```

 2. Command line
@@ -425,13 +431,13 @@ Examples:
 - Use device 0:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
 ```

 - Use multiple devices:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
 ```

 *Notes:*
@@ -452,7 +458,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 1. Install GPU driver

-Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
+Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

 2. Install Visual Studio

@@ -629,7 +635,7 @@ Once it is completed, final results will be in **build/Release/bin**

 #### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).

 ##### Check device

@@ -648,7 +654,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\llama-ls-sycl-device.exe
 ```

-This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
 ```
 found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@@ -658,13 +664,14 @@ found 2 SYCL devices:
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|

 ```
+
 #### Choose level-zero devices

 |Chosen Device ID|Setting|
 |-|-|
-|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
+|0|Default option. You may also want to `set ONEAPI_DEVICE_SELECTOR="level_zero:0"`|
 |1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
-|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
+|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|

 #### Execute

@@ -673,7 +680,13 @@ Choose one of following methods to run.
 1. Script

 ```
-examples\sycl\win-run-llama2.bat
+examples\sycl\win-run-llama-2.bat
+```
+
+or
+
+```
+examples\sycl\win-run-llama-3.bat
 ```

 2. Command line
@@ -697,13 +710,13 @@ Examples:
 - Use device 0:

 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
 ```

 - Use multiple devices:

 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
 ```


@@ -714,7 +727,9 @@ Note:
 ```sh
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
+
 Or
+
 ```sh
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```
@@ -726,15 +741,17 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 | Name               | Value                                 | Function                                    |
 |--------------------|---------------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
+| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.           |
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)             | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path. (1.) |
 | GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

+1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
+
 #### Runtime

 | Name              | Value            | Function                                                                                                                  |
@@ -752,7 +769,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 ## Q&A

- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+- Error:  `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.

  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
@@ -781,18 +798,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512

  It's same for other projects including llama.cpp SYCL backend.

- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
+- `Native API failed. Native API returns: 39 (UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)`, `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 3503030272 Bytes of memory on device`, or `failed to allocate SYCL0 buffer`

-  Device Memory is not enough.
+  You are running out of Device Memory.

  |Reason|Solution|
  |-|-|
-  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
-  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
+  | The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
+  | The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load the model.|

 ### **GitHub contribution**:
-Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
+Please add the `SYCL :` prefix/tag to issue/PR titles to help the SYCL contributors check/address them without delay.

 ## TODO

- NA
+- Review ZES_ENABLE_SYSMAN: https://github.com/intel/compute-runtime/blob/master/programmers-guide/SYSMAN.md#support-and-limitations
@@ -22,6 +22,9 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.

@@ -74,4 +74,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
 (tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
 (tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
 (tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
+
+# Llama 4 Scout
+(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
    const int N = 5;  // n-gram size
    const int G = 15; // max verification n-grams

-    const bool dump_kv_cache = params.dump_kv_cache;
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
    // here we keep adding new n-grams as we go
    ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);

-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
    const auto t_dec_start = ggml_time_us();

    // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
    }

    while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
        //
        // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {

    common_sampler_free(smpl);

-    llama_kv_cache_view_free(&kvc_view);
-
    llama_batch_free(batch);

    llama_backend_free();
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.speculative.n_max;

-    const bool dump_kv_cache = params.dump_kv_cache;
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){

    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
    const auto t_dec_start = ggml_time_us();

    while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        // print current draft sequence
        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

@@ -1,3 +1,14 @@
 # llama.cpp/example/parallel

 Simplified simulation of serving incoming requests in parallel
+
+## Example
+
+Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
+
+```bash
+llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
+```
+
+> [!NOTE]
+> It's recommended to use base models with this example. Instruction tuned models might not be able to properly follow the custom chat template specified here, so the results might not be as expected.
@@ -34,11 +34,61 @@ static std::string k_system =
 R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
 The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

-User: Recommend a nice restaurant in the area.
-Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
-User: Who is Richard Feynman?
-Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
-User:)";
+User:
+Recommend a nice restaurant in the area.
+Assistant:
+I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+User:
+Who is Richard Feynman?
+Assistant:
+Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+)";
+
+// Pool of filler ("junk") questions used to pad each client's prompt.
+// Entry i is paired with k_answers[i]: the prompt builder picks one random
+// index and uses it for both vectors, so the two lists must stay the same
+// length and in the same order.
+static std::vector<std::string> k_questions = {
+    "What is the tallest mountain in the world?",
+    "Who was the first person to win two Nobel Prizes?",
+    "Which country invented paper?",
+    "What organ is primarily responsible for pumping blood throughout the body?",
+    "Which planet is known for its prominent ring system?",
+    "Who directed the movie 'Inception'?",
+    "What is the freezing point of water in Fahrenheit?",
+    "Which animal is known to have the longest lifespan?",
+    "What language has the most native speakers worldwide?",
+    "What is the capital city of Canada?",
+    "Who is credited with inventing the World Wide Web?",
+    "Which metal is liquid at room temperature?",
+    "What is the term for an animal that eats both plants and meat?",
+    "Who painted 'The Starry Night'?",
+    "What gas do humans exhale that plants use for photosynthesis?",
+    "What year did World War II end?",
+    "Which continent has the most countries?",
+    "Who wrote the novel 'Frankenstein'?",
+    "What does DNA stand for?",
+    "What is the main ingredient in traditional Japanese miso soup?"
+};
+
+// Canned answers for the filler questions: k_answers[i] answers k_questions[i].
+// Indexed with an index drawn from k_questions.size(), so this list must have
+// at least as many entries as k_questions and keep the same ordering.
+static std::vector<std::string> k_answers = {
+    "The tallest mountain in the world is Mount Everest.",
+    "Marie Curie was the first person to win two Nobel Prizes.",
+    "Paper was invented in China.",
+    "The heart is the organ responsible for pumping blood.",
+    "Saturn is known for its prominent ring system.",
+    "Christopher Nolan directed the movie 'Inception'.",
+    "The freezing point of water in Fahrenheit is 32°F.",
+    "The bowhead whale is known to have the longest lifespan among mammals.",
+    "Mandarin Chinese has the most native speakers in the world.",
+    "The capital city of Canada is Ottawa.",
+    "Tim Berners-Lee is credited with inventing the World Wide Web.",
+    "Mercury is the metal that is liquid at room temperature.",
+    "An animal that eats both plants and meat is called an omnivore.",
+    "'The Starry Night' was painted by Vincent van Gogh.",
+    "Humans exhale carbon dioxide, which plants use in photosynthesis.",
+    "World War II ended in 1945.",
+    "Africa is the continent with the most countries.",
+    "The novel 'Frankenstein' was written by Mary Shelley.",
+    "DNA stands for Deoxyribonucleic Acid.",
+    "The main ingredient in traditional Japanese miso soup is fermented soybean paste."
+};

 static std::vector<std::string> k_prompts = {
    "What is the meaning of life?",
@@ -49,7 +99,7 @@ static std::vector<std::string> k_prompts = {
    "What is the best way to learn a new language?",
    "How to get a job at Google?",
    "If you could have any superpower, what would it be?",
-    "I want to learn how to play the piano.",
+    "I want to learn how to play the piano. What would be the best way to do it?",
 };

 struct client {
@@ -68,6 +118,7 @@ struct client {
    int64_t t_start_prompt;
    int64_t t_start_gen;

+    int32_t n_past    = 0;
    int32_t n_prompt  = 0;
    int32_t n_decoded = 0;
    int32_t i_batch   = -1;
@@ -107,6 +158,7 @@ int main(int argc, char ** argv) {
    common_params params;

    params.n_predict = 128;
+    params.n_junk = 0;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
@@ -126,7 +178,11 @@ int main(int argc, char ** argv) {
    // insert new requests as soon as the previous one is done
    const bool cont_batching = params.cont_batching;

-    const bool dump_kv_cache = params.dump_kv_cache;
+    // is the system prompt shared in the cache
+    const bool is_sp_shared = params.is_pp_shared;
+
+    // extra text to insert in each client's prompt in order to make it larger
+    const int32_t n_junk = params.n_junk;

    // init llama.cpp
    llama_backend_init();
@@ -169,6 +225,7 @@ int main(int argc, char ** argv) {
    }

    std::vector<llama_token> tokens_system;
+
    tokens_system = common_tokenize(ctx, k_system, true);
    const int32_t n_tokens_system = tokens_system.size();

@@ -182,15 +239,13 @@ int main(int argc, char ** argv) {
    int32_t n_total_gen    = 0;
    int32_t n_cache_miss   = 0;

-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
    const auto t_main_start = ggml_time_us();

    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    LOG_INF("\n");

-    {
+    if (is_sp_shared) {
        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);

        for (int32_t i = 0; i < n_tokens_system; ++i) {
@@ -213,11 +268,6 @@ int main(int argc, char ** argv) {
    LOG_INF("Processing requests ...\n\n");

    while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        common_batch_clear(batch);

        // decode any currently ongoing sequences
@@ -228,7 +278,7 @@ int main(int argc, char ** argv) {

            client.i_batch = batch.n_tokens;

-            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);

            client.n_decoded += 1;
        }
@@ -254,9 +304,23 @@ int main(int argc, char ** argv) {
                    client.t_start_gen    = 0;

                    client.input    = k_prompts[rand() % k_prompts.size()];
-                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

+                    // construct the prompt:
+                    // [system prompt] + [junk] + [user prompt]
+                    client.n_past = 0;
+                    client.prompt = "";
+                    if (is_sp_shared) {
+                        client.n_past = n_tokens_system;
+                    } else {
+                        client.prompt += k_system;
+                    }
+                    for (int i = 0; i < n_junk; ++i) {
+                        const int r = rand() % k_questions.size();
+                        client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
+                    }
+                    client.prompt += "User:\n" + client.input + "\nAssistant:\n";
+
                    common_sampler_reset(client.smpl);

                    // do not prepend BOS because we have a system prompt!
@@ -264,7 +328,7 @@ int main(int argc, char ** argv) {
                    tokens_prompt = common_tokenize(ctx, client.prompt, false);

                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                        common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
                    }

                    // extract the logits only for the last token
@@ -363,10 +427,9 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (llama_vocab_is_eog(vocab, id) ||
-                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
-                         client.response.find("User:") != std::string::npos ||
-                         client.response.find('\n') != std::string::npos)) {
+                    (llama_vocab_is_eog(vocab, id) ||
+                     (params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
+                     client.response.find("User:") != std::string::npos)) {
                    // basic reverse prompt
                    const size_t pos = client.response.find("User:");
                    if (pos != std::string::npos) {
@@ -84,13 +84,13 @@ int main(int argc, char ** argv) {
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

+    const llama_vocab * vocab = llama_model_get_vocab(model);
    // tokenize the prompt

    // find the number of tokens in the prompt
@@ -12,16 +12,16 @@ source /opt/intel/oneapi/setvars.sh

 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
-NGL=33
-CONEXT=4096
+NGL=99
+CONTEXT=4096

 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use single GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none

 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
 fi
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2025 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+# Runs llama-cli with a Llama 3 model on the SYCL backend.
+#
+# Usage:
+#   run-llama3.sh [DEVICE_ID]
+#     DEVICE_ID given   -> run on that device only (-mg DEVICE_ID -sm none)
+#     DEVICE_ID omitted -> no device/split options are passed to llama-cli
+
+# If you want more control, DPC++ allows selecting a specific device through the
+# following environment variable
+#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+# NOTE(review): the SYCL README links a differently named Llama 3 GGUF file as
+# its example download; confirm which file name this script should expect.
+MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
+NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
+CONTEXT=4096
+
+# All expansions below are quoted so that a model path or argument containing
+# spaces or glob characters is passed to llama-cli as a single word.
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    echo "Using $GGML_SYCL_DEVICE as the main GPU"
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m "${MODEL_FILE}" -p "${INPUT_PROMPT}" -n 400 -e -ngl "${NGL}" -c "${CONTEXT}" -mg "$GGML_SYCL_DEVICE" -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m "${MODEL_FILE}" -p "${INPUT_PROMPT}" -n 400 -e -ngl "${NGL}" -c "${CONTEXT}"
+fi
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


-.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
@@ -0,0 +1,9 @@
+::  MIT license
+::  Copyright (C) 2024 Intel Corporation
+::  SPDX-License-Identifier: MIT
+
+::  Runs llama-cli.exe with a Llama 3 model after loading the oneAPI environment.
+
+::  Prompt handed to llama-cli via -p; the surrounding quotes are part of the value.
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+::  Set up the oneAPI environment variables for this session.
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+::  -ngl 99: number of layers offloaded to the GPU (reduce it if the device runs out of memory).
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
@@ -128,6 +128,8 @@ extern "C" {
    // set gradients to zero, initialize loss, and optionally reset the optimizer
    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

+    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated statically
+
    // get underlying tensors that store data
    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
@@ -65,6 +65,7 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include <float.h>

 #include <cmath>
@@ -2587,3 +2588,149 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){

    ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * For every (token, expert-slot) entry in `ids`, one [M, D] x [D, 1] product
+ * is described between the selected expert's weights in `src0` and the
+ * matching row of `src1`, with the result row located in `dst`. All products
+ * are then submitted to GroupedMatmulV2 in chunks of at most GROUP_SIZE
+ * tensors.
+ *
+ * The routing indices are copied from device to host (followed by a stream
+ * synchronization) before the products are described. If `src0` is F16 it is
+ * first cast into a temporary F32 buffer taken from the context pool.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ * will be stored. Its sources are src[0] = expert weights [D, M, A, 1],
+ * src[1] = inputs [D, B, N, 1] (B = K or B = 1) and src[2] = ids [K, N],
+ * read on the host as int32 values.
+ *
+ * @note `src1` and `dst` are described to ACL as ACL_FLOAT without a type
+ * check here; `src0` is expected to be F32 or F16.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    //dst   [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  //src0  [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1  [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  //ids   [K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as = ne02; // A
+    int64_t n_ids = ids->ne[0]; // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+        ACL_MEMCPY_DEVICE_TO_HOST);
+    // the indices are read on the host below, so wait for the copy to finish
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *)  dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // src0 is F16, src1 is F32, dst is F32
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        // Contiguous F32 strides for the cast buffer. Use sizeof(float), not
+        // sizeof(float_t): the buffer is sized and described to ACL in units
+        // of float, and float_t is not guaranteed to be the same type.
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        // from here on, address the F32 copy with its own strides
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
+
+    // one (weight, input, output) tensor triple per (token, expert-slot) pair
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src1_ne, src1_nb, 2);
+            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    // GroupedMatmulV2 limits the length of each tensor list, so the work is
+    // submitted in chunks of at most GROUP_SIZE tensors.
+    // NOTE(review): the original comment stated "tensor_list.size < 128" while
+    // chunks hold up to 128 entries - confirm the exact limit against the
+    // aclnnGroupedMatmulV2 documentation.
+    const size_t GROUP_SIZE = 128;
+
+    // split and call GroupedMatmulV2
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+        // inputs (src1) are passed first, expert weights (src0) second
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // Dispatch on the type of the expert weights (src[0]): the two
+    // floating-point types share one implementation, anything else aborts.
+    const enum ggml_type weights_type = dst->src[0]->type;
+    const bool is_fp = weights_type == GGML_TYPE_F32 || weights_type == GGML_TYPE_F16;
+    if (!is_fp) {
+        GGML_ABORT("Unsupported type for mul_mat_id");
+    }
+    ggml_cann_mul_mat_id_fp(ctx, dst);
+}
@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
    }
 }

+/**
+ * @brief   Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ *          is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ *          in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ *          For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ *          performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ *          and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ *          Dimensions:
+ *              - src0: [D, M, A, 1], where A is the number of experts
+ *              - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
+ *              - ids : [K, N],       where K is the number of experts each token is routed to
+ *              - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
+ *
+ *          The function handles two main modes:
+ *              - If `ne12 == 1`, a simpler per-token loop is used.
+ *              - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
+ *
+ *          Supported types: the type of `dst->src[0]` must be GGML_TYPE_F32 or GGML_TYPE_F16;
+ *          any other type aborts with "Unsupported type for mul_mat_id".
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ *            Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
 * @brief Applies a element-wise operation to two input tensors using the CANN
 * backend.
@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_mul_mat(ctx, dst);
            break;
        case GGML_OP_MUL_MAT_ID:
-            return false;
+            ggml_cann_mul_mat_id(ctx, dst);
+            break;
        case GGML_OP_SCALE:
            ggml_cann_scale(ctx, dst);
            break;
@@ -2030,7 +2031,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        }
        case GGML_OP_MUL_MAT_ID:
-            return false;
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
        // embedding
        case GGML_OP_GET_ROWS: {
            switch (op->src[0]->type) {
@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"

 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // GGML_USE_HIP
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -48,6 +48,12 @@ static __global__ void flash_attn_vec_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    if (ncols > 1) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -91,6 +97,13 @@ static __global__ void flash_attn_vec_ext_f16(
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
+
+    __shared__ half maskh_shared[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        maskh_shared[j*D + tid] = 0.0f;
+    }
+
    __syncthreads();

    // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -175,6 +188,35 @@ static __global__ void flash_attn_vec_ext_f16(
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

+        if (mask) {
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
+            }
+
+            __syncthreads();
+
+            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
+            // In such cases, skip the KV slice.
+            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
+            bool skip = true;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+                for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
+                    skip = skip && isinf(tmp.x) && isinf(tmp.y);
+                }
+            }
+            if (__all_sync(0xFFFFFFFF, skip)) {
+                continue;
+            }
+#endif // GGML_USE_HIP
+        }
+
        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
@@ -202,7 +244,7 @@ static __global__ void flash_attn_vec_ext_f16(
                    sum = logit_softcap*tanhf(sum);
                }

-                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+                sum += maskh_shared[j*D + i_KQ];

                if (ncols == 1) {
                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
@@ -335,7 +377,9 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (Q->ne[1] == 1) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"

 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // GGML_USE_HIP
 static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -60,6 +60,12 @@ static __global__ void flash_attn_vec_ext_f32(
        NO_DEVICE_CODE;
        return;
    }
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    if (ncols > 1) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -104,6 +110,13 @@ static __global__ void flash_attn_vec_ext_f32(
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
+
+    __shared__ float maskf_shared[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        maskf_shared[j*D + tid] = 0.0f;
+    }
+
    __syncthreads();

    // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -181,6 +194,34 @@ static __global__ void flash_attn_vec_ext_f32(
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

+        if (mask) {
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
+            }
+
+            __syncthreads();
+
+            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
+            // In such cases, skip the KV slice.
+            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
+            bool skip = true;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+                for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    skip = skip && isinf(maskf_shared[j*D + i]);
+                }
+            }
+            if (__all_sync(0xFFFFFFFF, skip)) {
+                continue;
+            }
+#endif // GGML_USE_HIP
+        }
+
        float kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
@@ -204,7 +245,7 @@ static __global__ void flash_attn_vec_ext_f32(
                    sum = logit_softcap*tanhf(sum);
                }

-                sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
+                sum += maskf_shared[j*D + i_KQ];

                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);

@@ -326,7 +367,9 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (Q->ne[1] == 1) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
@@ -3255,7 +3255,7 @@ template<
    typename kd4x4_t, // key type in device memory
    short nl_k,
    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
-    typename vd4x4_t, // key type in device memory
+    typename vd4x4_t, // value type in device memory
    short nl_v,
    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
    short DK,        // K head size
@@ -3776,7 +3776,7 @@ template<
    typename kd4_t, // key type in device memory
    short nl_k,
    void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
-    typename vd4_t, // key type in device memory
+    typename vd4_t, // value type in device memory
    short nl_v,
    void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
    short DK,       // K head size
@@ -576,6 +576,10 @@ void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) {
    }
 }

+// Returns the `static_graphs` flag stored in the given optimization context.
+bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx) {
+    const bool uses_static_graphs = opt_ctx->static_graphs;
+    return uses_static_graphs;
+}
+
 struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) {
    return opt_ctx->inputs;
 }
@@ -842,6 +846,7 @@ void ggml_opt_epoch(
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
+    GGML_ASSERT(ggml_opt_static_graphs(opt_ctx) && "ggml_opt_epoch requires static graphs");
    struct ggml_tensor * inputs = ggml_opt_inputs(opt_ctx);
    struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
    struct ggml_tensor * data   = ggml_opt_dataset_data(dataset);
@@ -385,16 +385,17 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
    ggml_sycl_set_device(ctx->device);
    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+    SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+#ifndef _WIN32
    // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
    // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
-    char* host_buf = (char*)malloc(size);
+    char * host_buf = (char *) malloc(size);
    memcpy(host_buf, data, size);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
-                             .wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
    free(host_buf);
+#else
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
+#endif
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3027,7 +3028,7 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
            ctx.opt_feature.reorder &&      //allow this device due to good perf, skip the devices with bad perf.
            dst->op == GGML_OP_MUL_MAT &&   //limit to some supported cases of Q4_0, to do for more cases.
-            dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+            dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
 }

 static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -3150,8 +3151,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
    } else {
        constexpr bool convert_src1_to_q8_1 = false;
-        // MUL_MAT_SYCL supports reorder
-        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
    }
    GGML_SYCL_DEBUG("call %s done\n", __func__);
@@ -54,6 +54,11 @@ if (Vulkan_FOUND)
        -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
    )

+    set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "")
+    if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo")
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE})
+    endif()
+
    # Test all shader extensions
    test_shader_extension_support(
        "GL_KHR_cooperative_matrix"
@@ -149,7 +154,7 @@ if (Vulkan_FOUND)
        vulkan-shaders-gen
        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
        CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS}
-        BUILD_COMMAND ${CMAKE_COMMAND} --build .
+        BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS}
        INSTALL_COMMAND ${CMAKE_COMMAND} --install .
        INSTALL_DIR ${CMAKE_BINARY_DIR}
    )
@@ -2031,25 +2031,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
            CREATE_MM(pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3)
        }
 #endif
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0], matmul_q4_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1], matmul_q4_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0], matmul_q5_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1], matmul_q5_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0], matmul_q8_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K], matmul_q2_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K], matmul_q3_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K], matmul_q4_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K], matmul_q5_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K], matmul_q6_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S],   matmul_iq1_s_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M],   matmul_iq1_m_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS],  matmul_iq2_xs_f16,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S],   matmul_iq2_s_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S],   matmul_iq3_s_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS],  matmul_iq4_xs_f16,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL],  matmul_iq4_nl_f16,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)

        CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
 #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
@@ -2117,47 +2117,47 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #endif

        if (device->coopmat_acc_f16_support) {
-            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

-            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S],   matmul_iq1_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M],   matmul_iq1_m_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS],  matmul_iq2_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S],   matmul_iq2_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S],   matmul_iq3_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS],  matmul_iq4_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL],  matmul_iq4_nl_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
        } else {
-            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

-            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc,   matmul_iq1_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc,   matmul_iq1_m_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc,  matmul_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc,   matmul_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc,   matmul_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc,  matmul_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc,  matmul_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
        }

        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
@@ -2232,13 +2232,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if (device->mul_mat ## ID ## _s[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \

-#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
-        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
-        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        // CREATE_MMQ: create the unaligned matmul pipelines for one quantized TYPE.
+        // PIPELINE_NAME must name an object with .f16acc and .f32acc members; for every
+        // tile size (l/m/s) whose device->mul_mat<ID>_{l,m,s}[TYPE] flag is set, BOTH
+        // accumulator variants are created: the f16acc one from the NAMELC_f16acc_{len,data}
+        // shader blob and the f32acc one from NAMELC_{len,data}.
+        // It is invoked below for the pipeline_dequant_mul_mat_mat_q8_1 entries, guarded by
+        // device->integer_dot_product.
+#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->mul_mat ## ID ## _l[TYPE]) { \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f16acc->l, #NAMELC "_f16acc_l", NAMELC ## _f16acc_len, NAMELC ##  _f16acc_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->l, #NAMELC        "_l", NAMELC ## _len,        NAMELC ##  _data,        "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        } \
+        if (device->mul_mat ## ID ## _m[TYPE]) { \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f16acc->m, #NAMELC "_f16acc_m", NAMELC ## _f16acc_len, NAMELC ##  _f16acc_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->m, #NAMELC        "_m", NAMELC ## _len,        NAMELC ##  _data,        "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        } \
+        if (device->mul_mat ## ID ## _s[TYPE]) { \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f16acc->s, #NAMELC "_f16acc_s", NAMELC ## _f16acc_len, NAMELC ##  _f16acc_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->s, #NAMELC        "_s", NAMELC ## _len,        NAMELC ##  _data,        "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        } \

        // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
@@ -2252,34 +2258,34 @@ static void ggml_vk_load_shaders(vk_device& device) {

        CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );

-        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

-        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S],   matmul_iq1_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M],   matmul_iq1_m_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS],  matmul_iq2_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S],   matmul_iq2_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S],   matmul_iq3_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS],  matmul_iq4_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL],  matmul_iq4_nl_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (device->integer_dot_product) {
-            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0], matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1], matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0], matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1], matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0], matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        }
 #endif

@@ -2328,13 +2334,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if (device->mul_mat ## ID ## _s[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \

+        // CREATE_MMQ (this variant uses the NAMELC_fp32_{len,data} shader blobs): creates one
+        // pipeline per enabled tile size (l/m/s) directly on PIPELINE_NAME, so the caller
+        // passes the specific accumulator member (the invocations below pass .f32acc).
+        // Unlike the other CREATE_MMQ definition in this function, no f16acc variant is created.
-#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC "_l", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC "_m", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC "_s", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \

        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
@@ -2366,11 +2372,11 @@ static void ggml_vk_load_shaders(vk_device& device) {

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (device->integer_dot_product) {
-            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        }
 #endif

@@ -3711,7 +3717,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }

 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
-    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ", " << prec << ")");
    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
        return ctx->device->pipeline_matmul_f32;
    }
@@ -3739,7 +3745,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

    // MMQ
    if (src1_type == GGML_TYPE_Q8_1) {
-        vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc;
+        vk_matmul_pipeline pipelines = (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc;

        if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
            return nullptr;
@@ -3779,9 +3785,12 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

    if (ctx->device->coopmat2) {
        assert(src1_type == GGML_TYPE_F16);
-        return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc;
+        return prec == GGML_PREC_DEFAULT ? ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f32acc;
    }
-    return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
+    if (ctx->device->coopmat_support) {
+        return (ctx->device->fp16 && ctx->device->coopmat_acc_f16_support && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
+    }
+    return (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
 }

 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) {
@@ -5872,10 +5881,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    vk_pipeline *pipelines;
    bool small_rows = N <= get_fa_num_small_rows(path);

+    // coopmat1 does not actually support "small rows" (it needs 16 rows).
+    // So use scalar instead.
    if (small_rows && path == FA_COOPMAT1) {
        path = FA_SCALAR;
    }

+    // scalar is faster than coopmat2 when N==1
+    if (N == 1 && path == FA_COOPMAT2) {
+        path = FA_SCALAR;
+    }
+
    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;

    switch (path) {
@@ -10254,7 +10270,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_CONCAT) {
        tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_UPSCALE) {
-        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->op_params[0], tensor->op_params[1], (ggml_scale_mode) tensor->op_params[0]);
+        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
    } else if (tensor->op == GGML_OP_SCALE) {
        const float * params = (const float *)tensor->op_params;
        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
@@ -10543,7 +10559,8 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
                        ggml_vk_print_graph_origin(tensor, done);
                        GGML_ABORT("fatal error");
                    }
-                    if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
+                    const double denom = std::fabs(correct) > 1.0f ? (std::fabs(correct) > 1e-8 ? std::fabs(correct) : 1e-8) : 1.0f;
+                    if (first_error[0] == -1 && std::fabs(correct - result) / denom > 0.5) {
                        first_error[0] = i0;
                        first_error[1] = i1;
                        first_error[2] = i2;
@@ -10555,7 +10572,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
                    // Special case, value is infinite, avoid NaN result in avg_err
                    // NaN also appears in results, if both are nan error is 0
                    if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) {
-                        avg_err += std::fabs(correct - result);
+                        avg_err += std::fabs(correct - result) / denom;
                    }
                    counter++;
                }
@@ -10590,7 +10607,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        ggml_vk_print_graph_origin(tensor, done);
    }

-    if (avg_err > 0.05 || std::isnan(avg_err)) {
+    if (avg_err > 0.5 || std::isnan(avg_err)) {
        std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
        if (src0 != nullptr) {
@@ -9,60 +9,13 @@
 #extension GL_KHR_shader_subgroup_shuffle : enable

 #include "types.comp"
+#include "flash_attn_base.comp"

-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (constant_id = 0) const uint32_t WorkGroupSize = 128;
-layout (constant_id = 1) const uint32_t Br = 1;
-layout (constant_id = 2) const uint32_t Bc = 32;
-layout (constant_id = 3) const uint32_t D = 32;
-
-layout (constant_id = 5) const uint32_t D_split = 16;
 const uint32_t D_per_thread = D / D_split;

 const uint32_t cols_per_iter = WorkGroupSize / D_split;
 const uint32_t cols_per_thread = Bc / cols_per_iter;

-layout (push_constant) uniform parameter {
-    uint32_t N;
-    uint32_t KV;
-
-    uint32_t ne1;
-    uint32_t ne2;
-    uint32_t ne3;
-
-    uint32_t neq2;
-    uint32_t neq3;
-    uint32_t nek2;
-    uint32_t nek3;
-    uint32_t nev2;
-    uint32_t nev3;
-    uint32_t nem1;
-
-    uint32_t nb01;
-    uint32_t nb02;
-    uint32_t nb03;
-    uint32_t nb11;
-    uint32_t nb12;
-    uint32_t nb13;
-    uint32_t nb21;
-    uint32_t nb22;
-    uint32_t nb23;
-    uint32_t nb31;
-
-    float scale;
-    float max_bias;
-    float logit_softcap;
-
-    uint32_t mask;
-    uint32_t n_head_log2;
-    float m0;
-    float m1;
-
-    uint32_t gqa_ratio;
-    uint32_t split_kv;
-    uint32_t k_num;
-} p;

 layout (binding = 0) readonly buffer Q {float data_q[];};
 layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
@@ -71,39 +24,6 @@ layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
 layout (binding = 2) readonly buffer V {float16_t data_v[];};
 layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
 layout (binding = 3) readonly buffer M {float16_t data_m[];};
-layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
-
-#if defined(A_TYPE_PACKED16)
-#define BINDING_IDX_K 0
-#define BINDING_IDX_V 1
-layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
-#endif
-
-#if defined(DATA_A_Q4_0)
-#define BLOCK_BYTE_SIZE 18
-
-vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
-    uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
-    uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
-    uint shift = (iqs & 0x10) >> 2;
-    vui_lo >>= shift;
-    vui_hi >>= shift;
-
-    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
-}
-#endif
-
-#if defined(DATA_A_Q8_0)
-#define BLOCK_BYTE_SIZE 34
-vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
-    const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
-    const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
-
-    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
-}
-#endif
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

 // Store the output when doing grouped query attention.
 // Rows index by Q's dimension 2, and the first N rows are valid.
@@ -114,27 +34,6 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
    return elem;
 }

-// Store column zero. This is used to save per-row m and L values for split_k.
-ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
-{
-    if (r < N && c == 0) {
-        uint32_t offset = iq2 + r;
-        data_o[o_offset + offset] = D_TYPE(elem);
-    }
-    return elem;
-}
-
-// Load the slope matrix, indexed by Q's dimension 2.
-ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
-{
-    const uint32_t h = iq2 + (r % p.gqa_ratio);
-
-    const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
-    const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
-
-    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
-}
-
 shared FLOAT_TYPE tmpsh[WorkGroupSize];
 shared vec4 tmpshv4[WorkGroupSize];

@@ -146,58 +45,12 @@ void main() {
    init_iq_shmem(gl_WorkGroupSize);
 #endif

-    const uint32_t tid = gl_LocalInvocationIndex;
-    const uint32_t N = p.N;
-    const uint32_t KV = p.KV;
+    init_indices();

+    const uint32_t tid = gl_LocalInvocationIndex;
    const uint32_t d_tid = gl_LocalInvocationIndex % D_split;
    const uint32_t col_tid = gl_LocalInvocationIndex / D_split;

-    uint32_t i = gl_WorkGroupID.x;
-    uint32_t split_k_index = 0;
-
-    if (p.k_num > 1) {
-        i = 0;
-        split_k_index = gl_WorkGroupID.x;
-    }
-
-    const uint32_t Tr = CEIL_DIV(N, Br);
-
-    const uint32_t start_j = split_k_index * p.split_kv / Bc;
-    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
-
-    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
-    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
-    const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio;
-    const uint32_t iq3 = gl_WorkGroupID.z;
-
-    // broadcast factors
-    const uint32_t rk2 = p.neq2/p.nek2;
-    const uint32_t rk3 = p.neq3/p.nek3;
-
-    const uint32_t rv2 = p.neq2/p.nev2;
-    const uint32_t rv3 = p.neq3/p.nev3;
-
-    // k indices
-    const uint32_t ik3 = iq3 / rk3;
-    const uint32_t ik2 = iq2 / rk2;
-
-    // v indices
-    const uint32_t iv3 = iq3 / rv3;
-    const uint32_t iv2 = iq2 / rv2;
-
-    // nb?1 are already divided by the type size and are in units of elements.
-    // When using grouped query attention, Q is indexed by iq2, so the stride
-    // should be nb02 (which is in bytes).
-    uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
-    uint32_t k_stride = p.nb11;
-    uint32_t v_stride = p.nb21;
-    // When using grouped query attention, all rows use the same mask (stride 0).
-    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
-    // that prevents the compiler from folding the "&" through the select
-    // and breaking the alignment detection.
-    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
-
    uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;

    [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) {
@@ -0,0 +1,162 @@
+
+// Specialization constants shared by the flash attention shaders that include this file.
+// The values written here are only the defaults used when a constant is not specialized.
+// The workgroup x size is taken from specialization constant 0 (same id as WorkGroupSize).
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (constant_id = 0) const uint32_t WorkGroupSize = 128;
+// Row tile size: init_indices() computes Tr = CEIL_DIV(N, Br).
+layout (constant_id = 1) const uint32_t Br = 1;
+// KV tile size: start_j/end_j in init_indices() are expressed in units of Bc.
+layout (constant_id = 2) const uint32_t Bc = 32;
+// NOTE(review): presumably the head dimension - confirm against the host code.
+layout (constant_id = 3) const uint32_t D = 32;
+// Not referenced anywhere in this file.
+layout (constant_id = 4) const uint32_t Clamp = 0;
+// Not referenced in this file; the including shaders derive D_per_thread = D / D_split from it.
+layout (constant_id = 5) const uint32_t D_split = 16;
+
+
+// Push constants shared by the flash attention shaders that include this file.
+// NOTE(review): the field order must match the structure pushed by the host - confirm there.
+layout (push_constant) uniform parameter {
+    // Copied into the file-scope N / KV by init_indices(); N bounds the valid rows and
+    // KV bounds the range that is walked in Bc-sized steps.
+    uint32_t N;
+    uint32_t KV;
+
+    uint32_t ne1;
+    uint32_t ne2;
+    uint32_t ne3;
+
+    // Dimension 2/3 extents of Q (neq*), K (nek*) and V (nev*); init_indices() divides
+    // them to obtain the broadcast factors rk2/rk3/rv2/rv3.
+    uint32_t neq2;
+    uint32_t neq3;
+    uint32_t nek2;
+    uint32_t nek3;
+    uint32_t nev2;
+    uint32_t nev3;
+    uint32_t nem1;
+
+    // Strides. Per the comments in init_indices(): nb?1 are in units of elements,
+    // while nb02 is in bytes.
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t nb31;
+
+    float scale;
+    float max_bias;
+    float logit_softcap;
+
+    uint32_t mask;
+    // n_head_log2, m0 and m1 are consumed by perElemOpComputeSlope().
+    uint32_t n_head_log2;
+    float m0;
+    float m1;
+
+    // With grouped query attention each workgroup handles gqa_ratio consecutive iq2 values.
+    uint32_t gqa_ratio;
+    // When k_num > 1, gl_WorkGroupID.x selects a KV split of split_kv elements (see init_indices()).
+    uint32_t split_kv;
+    uint32_t k_num;
+} p;
+
+layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
+
+#if defined(A_TYPE_PACKED16)
+#define BINDING_IDX_K 0
+#define BINDING_IDX_V 1
+layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
+#endif
+
+#if defined(DATA_A_Q4_0)
+// NOTE(review): presumably the size in bytes of one Q4_0 block - confirm against types.comp.
+#define BLOCK_BYTE_SIZE 18
+
+// Dequantize four Q4_0 values from block (a_offset + ib) of the buffer selected by
+// binding_idx (BINDING_IDX_K or BINDING_IDX_V), starting at quant index iqs.
+// Returns d * (q - 8) for each of the four 4-bit quants q.
+vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    // Two consecutive qs elements; the low 4 bits of iqs pick the element pair.
+    uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
+    uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
+    // shift is 0 when iqs < 16 and 4 otherwise, i.e. it selects the low or high nibble of each byte.
+    uint shift = (iqs & 0x10) >> 2;
+    vui_lo >>= shift;
+    vui_hi >>= shift;
+
+    // Take the selected nibble from bits 0..3 and 8..11 of each element, recenter by 8, scale by d.
+    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
+}
+#endif
+
+#if defined(DATA_A_Q8_0)
+// NOTE(review): presumably the size in bytes of one Q8_0 block - confirm against types.comp.
+#define BLOCK_BYTE_SIZE 34
+// Dequantize four Q8_0 values from block (a_offset + ib) of the buffer selected by
+// binding_idx (BINDING_IDX_K or BINDING_IDX_V), starting at quant index iqs.
+// Reads qs[iqs / 2] and qs[iqs / 2 + 1], takes the first two signed bytes of each
+// and returns them scaled by the block's d.
+vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
+    const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
+
+    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
+}
+#endif
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+
+// Store column zero. This is used to save per-row m and L values for split_k.
+// Per-element callback: for elements in column 0 (c == 0) of a valid row (r < N), writes
+// elem converted to D_TYPE to data_o[o_offset + iq2 + r]. Every other element is skipped.
+// Always returns elem unchanged.
+ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+{
+    if (r < N && c == 0) {
+        uint32_t offset = iq2 + r;
+        data_o[o_offset + offset] = D_TYPE(elem);
+    }
+    return elem;
+}
+
+// Load the slope matrix, indexed by Q's dimension 2.
+// Per-element callback: the result depends only on the row r and iq2; c and elem are ignored.
+// Returns base^exph for head h = iq2 + (r % p.gqa_ratio), where
+//   h <  p.n_head_log2: base = p.m0, exph = h + 1
+//   h >= p.n_head_log2: base = p.m1, exph = 2*(h - p.n_head_log2) + 1
+// NOTE(review): this looks like the ALiBi slope formula (see p.max_bias) - confirm.
+ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
+{
+    const uint32_t h = iq2 + (r % p.gqa_ratio);
+
+    const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
+    const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
+
+    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
+}
+
+// Per-invocation indices and strides shared with the including shader.
+// They are uninitialized until init_indices() has run.
+uint32_t i, N, KV, split_k_index, Tr, start_j, end_j,
+         iq2, iq3, rk2, rk3, rv2, rv3, ik2, ik3, iv2, iv3,
+         q_stride, k_stride, v_stride, m_stride;
+
+// Fill the file-scope variables above from the push constants and gl_WorkGroupID.
+// The including shader is expected to call this at the start of main(), before reading any of them.
+void init_indices()
+{
+    N = p.N;
+    KV = p.KV;
+
+    // By default gl_WorkGroupID.x is the row-tile index i.
+    i = gl_WorkGroupID.x;
+    split_k_index = 0;
+
+    // With split-k (k_num > 1) gl_WorkGroupID.x instead selects the KV split and i is fixed to 0.
+    if (p.k_num > 1) {
+        i = 0;
+        split_k_index = gl_WorkGroupID.x;
+    }
+
+    // Number of Br-sized row tiles needed to cover N rows.
+    Tr = CEIL_DIV(N, Br);
+
+    // Range [start_j, end_j) of Bc-sized KV tiles covered by this split, clamped to KV.
+    start_j = split_k_index * p.split_kv / Bc;
+    end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
+
+    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
+    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
+    iq2 = gl_WorkGroupID.y * p.gqa_ratio;
+    iq3 = gl_WorkGroupID.z;
+
+    // broadcast factors
+    rk2 = p.neq2/p.nek2;
+    rk3 = p.neq3/p.nek3;
+
+    rv2 = p.neq2/p.nev2;
+    rv3 = p.neq3/p.nev3;
+
+    // k indices
+    ik3 = iq3 / rk3;
+    ik2 = iq2 / rk2;
+
+    // v indices
+    iv3 = iq3 / rv3;
+    iv2 = iq2 / rv2;
+
+    // nb?1 are already divided by the type size and are in units of elements.
+    // When using grouped query attention, Q is indexed by iq2, so the stride
+    // should be nb02 (which is in bytes).
+    q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
+    k_stride = p.nb11;
+    v_stride = p.nb21;
+    // When using grouped query attention, all rows use the same mask (stride 0).
+    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
+    // that prevents the compiler from folding the "&" through the select
+    // and breaking the alignment detection.
+    m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
+}
@@ -11,14 +11,7 @@
 #extension GL_KHR_cooperative_matrix : enable

 #include "types.comp"
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (constant_id = 1) const uint32_t Br = 1;
-layout (constant_id = 2) const uint32_t Bc = 32;
-layout (constant_id = 3) const uint32_t D = 32;
-
-layout (constant_id = 5) const uint32_t D_split = 16;
+#include "flash_attn_base.comp"

 const uint32_t D_per_thread = D / D_split;
 const uint32_t row_split = 4;
@@ -26,46 +19,6 @@ const uint32_t rows_per_thread = Br / row_split;
 const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split / row_split;
 const uint32_t cols_per_thread = Bc / cols_per_iter;

-layout (push_constant) uniform parameter {
-    uint32_t N;
-    uint32_t KV;
-
-    uint32_t ne1;
-    uint32_t ne2;
-    uint32_t ne3;
-
-    uint32_t neq2;
-    uint32_t neq3;
-    uint32_t nek2;
-    uint32_t nek3;
-    uint32_t nev2;
-    uint32_t nev3;
-    uint32_t nem1;
-
-    uint32_t nb01;
-    uint32_t nb02;
-    uint32_t nb03;
-    uint32_t nb11;
-    uint32_t nb12;
-    uint32_t nb13;
-    uint32_t nb21;
-    uint32_t nb22;
-    uint32_t nb23;
-    uint32_t nb31;
-
-    float scale;
-    float max_bias;
-    float logit_softcap;
-
-    uint32_t mask;
-    uint32_t n_head_log2;
-    float m0;
-    float m1;
-
-    uint32_t gqa_ratio;
-    uint32_t split_kv;
-    uint32_t k_num;
-} p;

 layout (binding = 0) readonly buffer Q {float data_q[];};
 layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
@@ -74,39 +27,6 @@ layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
 layout (binding = 2) readonly buffer V {float16_t data_v[];};
 layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
 layout (binding = 3) readonly buffer M {float16_t data_m[];};
-layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
-
-#if defined(A_TYPE_PACKED16)
-#define BINDING_IDX_K 0
-#define BINDING_IDX_V 1
-layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
-#endif
-
-#if defined(DATA_A_Q4_0)
-#define BLOCK_BYTE_SIZE 18
-
-vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
-    uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
-    uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
-    uint shift = (iqs & 0x10) >> 2;
-    vui_lo >>= shift;
-    vui_hi >>= shift;
-
-    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
-}
-#endif
-
-#if defined(DATA_A_Q8_0)
-#define BLOCK_BYTE_SIZE 34
-vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
-    const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
-    const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
-
-    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
-}
-#endif
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

 // Store the output when doing grouped query attention.
 // Rows index by Q's dimension 2, and the first N rows are valid.
@@ -117,27 +37,6 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
    return elem;
 }

-// Store column zero. This is used to save per-row m and L values for split_k.
-ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
-{
-    if (r < N && c == 0) {
-        uint32_t offset = iq2 + r;
-        data_o[o_offset + offset] = D_TYPE(elem);
-    }
-    return elem;
-}
-
-// Load the slope matrix, indexed by Q's dimension 2.
-ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
-{
-    const uint32_t h = iq2 + (r % p.gqa_ratio);
-
-    const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
-    const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
-
-    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
-}
-
 // These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
 const uint32_t MatBr = 16;
 const uint32_t MatBc = 16;
@@ -162,9 +61,9 @@ void main() {
    init_iq_shmem(gl_WorkGroupSize);
 #endif

+    init_indices();
+
    const uint32_t tid = gl_LocalInvocationIndex;
-    const uint32_t N = p.N;
-    const uint32_t KV = p.KV;

    const uint32_t threads_per_rowgroup = gl_WorkGroupSize.x / row_split;
    const uint32_t row_tid = gl_LocalInvocationIndex / threads_per_rowgroup;
@@ -173,51 +72,6 @@ void main() {

 #define tile_row(r) (row_tid * rows_per_thread + (r))

-    uint32_t i = gl_WorkGroupID.x;
-    uint32_t split_k_index = 0;
-
-    if (p.k_num > 1) {
-        i = 0;
-        split_k_index = gl_WorkGroupID.x;
-    }
-
-    const uint32_t Tr = CEIL_DIV(N, Br);
-
-    const uint32_t start_j = split_k_index * p.split_kv / Bc;
-    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
-
-    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
-    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
-    const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio;
-    const uint32_t iq3 = gl_WorkGroupID.z;
-
-    // broadcast factors
-    const uint32_t rk2 = p.neq2/p.nek2;
-    const uint32_t rk3 = p.neq3/p.nek3;
-
-    const uint32_t rv2 = p.neq2/p.nev2;
-    const uint32_t rv3 = p.neq3/p.nev3;
-
-    // k indices
-    const uint32_t ik3 = iq3 / rk3;
-    const uint32_t ik2 = iq2 / rk2;
-
-    // v indices
-    const uint32_t iv3 = iq3 / rv3;
-    const uint32_t iv2 = iq2 / rv2;
-
-    // nb?1 are already divided by the type size and are in units of elements.
-    // When using grouped query attention, Q is indexed by iq2, so the stride
-    // should be nb02 (which is in bytes).
-    uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
-    uint32_t k_stride = p.nb11;
-    uint32_t v_stride = p.nb21;
-    // When using grouped query attention, all rows use the same mask (stride 0).
-    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
-    // that prevents the compiler from folding the "&" through the select
-    // and breaking the alignment detection.
-    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
-
    uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;

    [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) {
@@ -18,62 +18,12 @@

 #include "types.comp"
 #include "dequant_funcs_cm2.comp"
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (constant_id = 1) const uint32_t Br = 32;
-layout (constant_id = 2) const uint32_t Bc = 32;
-layout (constant_id = 3) const uint32_t D = 32;
-layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV;
-
-layout (push_constant) uniform parameter {
-    uint32_t N;
-    uint32_t KV;
-
-    uint32_t ne1;
-    uint32_t ne2;
-    uint32_t ne3;
-
-    uint32_t neq2;
-    uint32_t neq3;
-    uint32_t nek2;
-    uint32_t nek3;
-    uint32_t nev2;
-    uint32_t nev3;
-    uint32_t nem1;
-
-    uint32_t nb01;
-    uint32_t nb02;
-    uint32_t nb03;
-    uint32_t nb11;
-    uint32_t nb12;
-    uint32_t nb13;
-    uint32_t nb21;
-    uint32_t nb22;
-    uint32_t nb23;
-    uint32_t nb31;
-
-    float scale;
-    float max_bias;
-    float logit_softcap;
-
-    uint32_t mask;
-    uint32_t n_head_log2;
-    float m0;
-    float m1;
-
-    uint32_t gqa_ratio;
-    uint32_t split_kv;
-    uint32_t k_num;
-} p;
+#include "flash_attn_base.comp"

 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
 layout (binding = 1) readonly buffer K {uint8_t data_k[];};
 layout (binding = 2) readonly buffer V {uint8_t data_v[];};
 layout (binding = 3) readonly buffer M {uint8_t data_m[];};
-layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

 ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
    return max(x, y);
@@ -118,67 +68,12 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
    return elem;
 }

-// Store column zero. This is used to save per-row m and L values for split_k.
-ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
-{
-    if (r < N && c == 0) {
-        uint32_t offset = iq2 + r;
-        data_o[o_offset + offset] = D_TYPE(elem);
-    }
-    return elem;
-}
-
-// Load the slope matrix, indexed by Q's dimension 2.
-ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
-{
-    const uint32_t h = iq2 + (r % p.gqa_ratio);
-
-    const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
-    const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
-
-    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
-}
-
 void main() {
 #ifdef NEEDS_INIT_IQ_SHMEM
    init_iq_shmem(gl_WorkGroupSize);
 #endif

-    const uint32_t N = p.N;
-    const uint32_t KV = p.KV;
-
-    uint32_t i = gl_WorkGroupID.x;
-    uint32_t split_k_index = 0;
-
-    if (p.k_num > 1) {
-        i = 0;
-        split_k_index = gl_WorkGroupID.x;
-    }
-
-    const uint32_t Tr = CEIL_DIV(N, Br);
-
-    const uint32_t start_j = split_k_index * p.split_kv / Bc;
-    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
-
-    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
-    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
-    const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio;
-    const uint32_t iq3 = gl_WorkGroupID.z;
-
-    // broadcast factors
-    const uint32_t rk2 = p.neq2/p.nek2;
-    const uint32_t rk3 = p.neq3/p.nek3;
-
-    const uint32_t rv2 = p.neq2/p.nev2;
-    const uint32_t rv3 = p.neq3/p.nev3;
-
-    // k indices
-    const uint32_t ik3 = iq3 / rk3;
-    const uint32_t ik2 = iq2 / rk2;
-
-    // v indices
-    const uint32_t iv3 = iq3 / rv3;
-    const uint32_t iv2 = iq2 / rv2;
+    init_indices();

    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
    tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp);
@@ -195,17 +90,6 @@ void main() {
    tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
    tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);

-    // nb?1 are already divided by the type size and are in units of elements.
-    // When using grouped query attention, Q is indexed by iq2, so the stride
-    // should be nb02 (which is in bytes).
-    uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
-    uint32_t k_stride = p.nb11;
-    uint32_t v_stride = p.nb21;
-    // When using grouped query attention, all rows use the same mask (stride 0).
-    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
-    // that prevents the compiler from folding the "&" through the select
-    // and breaking the alignment detection.
-    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
    // hint to the compiler that strides are aligned for the aligned variant of the shader
    if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
    {
@@ -64,12 +64,17 @@
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];

-#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
-    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
+#if defined(__linux__) || \
+    defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+    (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
+
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
+#if defined(__linux__)
+#include <sys/prctl.h>
+#endif

 #if defined(__ANDROID__)
 #include <unwind.h>
@@ -133,10 +138,36 @@ static void ggml_print_backtrace(void) {
    if (GGML_NO_BACKTRACE) {
        return;
    }
-    char attach[32];
-    snprintf(attach, sizeof(attach), "attach %d", getpid());
-    int pid = fork();
-    if (pid == 0) {
+#if defined(__linux__)
+    FILE * f = fopen("/proc/self/status", "r");
+    size_t size = 0;
+    char * line = NULL;
+    ssize_t length = 0;
+    while ((length = getline(&line, &size, f)) > 0) {
+        if (!strncmp(line, "TracerPid:", sizeof("TracerPid:") - 1) &&
+            (length != sizeof("TracerPid:\t0\n") - 1 || line[length - 2] != '0')) {
+            // Already being debugged, and the breakpoint is the later abort()
+            free(line);
+            fclose(f);
+            return;
+        }
+    }
+    free(line);
+    fclose(f);
+    int lock[2] = { -1, -1 };
+    (void) !pipe(lock); // Don't start gdb until after PR_SET_PTRACER
+#endif
+    const int parent_pid = getpid();
+    const int child_pid = fork();
+    if (child_pid < 0) { // error
+        return;
+    } else if (child_pid == 0) { // child
+        char attach[32];
+        snprintf(attach, sizeof(attach), "attach %d", parent_pid);
+#if defined(__linux__)
+        close(lock[1]);
+        (void) !read(lock[0], lock, 1);
+#endif
        // try gdb
        execlp("gdb", "gdb", "--batch",
            "-ex", "set style enabled on",
@@ -149,18 +180,18 @@ static void ggml_print_backtrace(void) {
        execlp("lldb", "lldb", "--batch",
            "-o", "bt",
            "-o", "quit",
-            "-p", attach,
+            "-p", &attach[sizeof("attach ") - 1],
            (char *) NULL);
-        exit(EXIT_FAILURE);
-    } else {
-        int wstatus;
-        waitpid(pid, &wstatus, 0);
-        if (WIFEXITED(wstatus)) {
-            if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
-                // gdb failed, fallback to backtrace_symbols
-                ggml_print_backtrace_symbols();
-            }
-        }
+        // gdb failed, fallback to backtrace_symbols
+        ggml_print_backtrace_symbols();
+        _Exit(0);
+    } else { // parent
+#if defined(__linux__)
+        prctl(PR_SET_PTRACER, child_pid);
+        close(lock[1]);
+        close(lock[0]);
+#endif
+        waitpid(child_pid, NULL, 0);
    }
 }
 #else
@@ -482,14 +482,15 @@ class MODEL_TENSOR(IntEnum):
    V_ENC_EMBD_CLS       = auto()
    V_ENC_EMBD_PATCH     = auto()
    V_ENC_EMBD_POS       = auto()
+    V_ENC_INPUT_NORM     = auto()
    V_ENC_ATTN_Q         = auto()
    V_ENC_ATTN_Q_NORM    = auto()
    V_ENC_ATTN_K         = auto()
    V_ENC_ATTN_K_NORM    = auto()
    V_ENC_ATTN_V         = auto()
-    V_ENC_INPUT_NORM     = auto()
-    V_ENC_OUTPUT         = auto()
-    V_ENC_OUTPUT_NORM    = auto()
+    V_ENC_ATTN_O         = auto()
+    V_ENC_ATTN_O_NORM    = auto()
+    V_ENC_POST_ATTN_NORM = auto()
    V_ENC_FFN_UP         = auto()
    V_ENC_FFN_GATE       = auto()
    V_ENC_FFN_DOWN       = auto()
@@ -749,8 +750,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_ENC_ATTN_K_NORM:         "v.blk.{bid}.attn_k_norm",
    MODEL_TENSOR.V_ENC_ATTN_V:              "v.blk.{bid}.attn_v",
    MODEL_TENSOR.V_ENC_INPUT_NORM:          "v.blk.{bid}.ln1",
-    MODEL_TENSOR.V_ENC_OUTPUT:              "v.blk.{bid}.attn_out",
-    MODEL_TENSOR.V_ENC_OUTPUT_NORM:         "v.blk.{bid}.ln2",
+    MODEL_TENSOR.V_ENC_ATTN_O:              "v.blk.{bid}.attn_out",
+    MODEL_TENSOR.V_ENC_ATTN_O_NORM:         "v.blk.{bid}.attn_out_norm",
+    MODEL_TENSOR.V_ENC_POST_ATTN_NORM:      "v.blk.{bid}.ln2",
    MODEL_TENSOR.V_ENC_FFN_UP:              "v.blk.{bid}.ffn_up",
    MODEL_TENSOR.V_ENC_FFN_GATE:            "v.blk.{bid}.ffn_gate",
    MODEL_TENSOR.V_ENC_FFN_DOWN:            "v.blk.{bid}.ffn_down",
@@ -785,14 +787,15 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_ENC_EMBD_CLS,
        MODEL_TENSOR.V_ENC_EMBD_PATCH,
        MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
        MODEL_TENSOR.V_ENC_ATTN_Q,
        MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
        MODEL_TENSOR.V_ENC_ATTN_K,
        MODEL_TENSOR.V_ENC_ATTN_K_NORM,
        MODEL_TENSOR.V_ENC_ATTN_V,
-        MODEL_TENSOR.V_ENC_INPUT_NORM,
-        MODEL_TENSOR.V_ENC_OUTPUT,
-        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_ATTN_O,
+        MODEL_TENSOR.V_ENC_ATTN_O_NORM,
+        MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
        MODEL_TENSOR.V_ENC_FFN_UP,
        MODEL_TENSOR.V_ENC_FFN_GATE,
        MODEL_TENSOR.V_ENC_FFN_DOWN,
@@ -2180,6 +2183,7 @@ class VisionProjectorType:
    GEMMA3 = "gemma3"
    IDEFICS3 = "idefics3"
    PIXTRAL = "pixtral"
+    LLAMA4 = "llama4"
    QWEN2VL = "qwen2vl_merger"
    QWEN25VL = "qwen2.5vl_merger"
    INTERNVL = "internvl"
@@ -902,10 +902,12 @@ class TensorNameMap:

        MODEL_TENSOR.V_MMPROJ_FC: (
            "model.connector.modality_projection.proj", # SmolVLM
+            "multi_modal_projector.linear_1", # llama 4
        ),

        MODEL_TENSOR.V_MMPROJ_MLP: (
            "model.mm_projector.mlp.mlp.{bid}",
+            "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
            "mlp1.{bid}", # InternVL
        ),

@@ -915,6 +917,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_EMBD_CLS: (
            "vision_tower.vision_model.embeddings.class_embedding",
+            "vision_model.class_embedding", # llama 4
        ),

        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -922,6 +925,7 @@ class TensorNameMap:
            "vpm.embeddings.patch_embedding",
            "model.vision_model.embeddings.patch_embedding", # SmolVLM
            "vision_tower.patch_conv", # pixtral
+            "vision_model.patch_embedding.linear", # llama 4
            "visual.patch_embed.proj", # qwen2vl
        ),

@@ -929,12 +933,14 @@ class TensorNameMap:
            "vision_tower.vision_model.embeddings.position_embedding",
            "vpm.embeddings.position_embedding",
            "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "vision_model.positional_embedding_vlm", # llama 4
        ),

        MODEL_TENSOR.V_ENC_ATTN_Q: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
            "vpm.encoder.layers.{bid}.self_attn.q_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
            "visual.blocks.{bid}.attn.q", # qwen2vl, generated
        ),
@@ -947,6 +953,7 @@ class TensorNameMap:
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
            "vpm.encoder.layers.{bid}.self_attn.k_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
        ),
@@ -959,6 +966,7 @@ class TensorNameMap:
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
            "vpm.encoder.layers.{bid}.self_attn.v_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
            "visual.blocks.{bid}.attn.v", # qwen2vl, generated
        ),
@@ -969,23 +977,26 @@ class TensorNameMap:
            "vpm.encoder.layers.{bid}.layer_norm1",
            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "vision_model.model.layers.{bid}.input_layernorm", # llama4
            "visual.blocks.{bid}.norm1", # qwen2vl
        ),

-        MODEL_TENSOR.V_ENC_OUTPUT: (
+        MODEL_TENSOR.V_ENC_ATTN_O: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
            "vpm.encoder.layers.{bid}.self_attn.out_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
            "visual.blocks.{bid}.attn.proj", # qwen2vl
        ),

-        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
            "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
            "vpm.encoder.layers.{bid}.layer_norm2",
            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+            "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
            "visual.blocks.{bid}.norm2", # qwen2vl
        ),
@@ -995,6 +1006,7 @@ class TensorNameMap:
            "vpm.encoder.layers.{bid}.mlp.fc1",
            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc1", # llama4
            "visual.blocks.{bid}.mlp.fc1", # qwen2vl
            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
        ),
@@ -1009,6 +1021,7 @@ class TensorNameMap:
            "vpm.encoder.layers.{bid}.mlp.fc2",
            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc2", # llama4
            "visual.blocks.{bid}.mlp.fc2", # qwen2vl
            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
        ),
@@ -1024,11 +1037,13 @@ class TensorNameMap:
        MODEL_TENSOR.V_PRE_NORM: (
            "vision_tower.vision_model.pre_layrnorm",
            "vision_tower.ln_pre", # pixtral
+            "vision_model.layernorm_pre", # llama4
        ),

        MODEL_TENSOR.V_POST_NORM: (
            "vision_tower.vision_model.post_layernorm",
            "model.vision_model.post_layernorm", # SmolVLM
+            "vision_model.layernorm_post", # llama4
            "visual.merger.ln_q", # qwen2vl
        ),

@@ -361,10 +361,11 @@ extern "C" {

        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-        bool op_offload;  // whether to offload host tensor operations to device
+        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // use flash attention [EXPERIMENTAL]
+        bool no_perf;     // disable performance timing measurements
+        bool op_offload;  // offload host tensor operations to device
+        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    };

    // model quantization parameters
@@ -607,72 +608,13 @@ extern "C" {
    // KV cache
    //

-    // TODO: start using struct llama_kv_cache
-
-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_seq_max;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_seq_max items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    ///
-
    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);

-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "use llama_kv_self_n_tokens instead");
-
    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);

-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-            "use llama_kv_self_used_cells instead");
-
    // Clear the KV cache - both cell info is erased and KV data is zeroed
    LLAMA_API void llama_kv_self_clear(
            struct llama_context * ctx);
@@ -730,10 +672,18 @@ extern "C" {
                       llama_pos   p1,
                             int   d);

+    // Returns the smallest position present in the KV cache for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
    // Returns the largest position present in the KV cache for the specified sequence
+    // Return -1 if the sequence is empty
    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
-                     llama_seq_id   seq_id);
+                    llama_seq_id   seq_id);

    // Defragment the KV cache
    // This will be applied:
@@ -747,61 +697,6 @@ extern "C" {
    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);

-    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx),
-            "use llama_kv_self_clear instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "use llama_kv_self_seq_rm instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "use llama_kv_self_seq_cp instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "use llama_kv_self_seq_keep instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta),
-            "use llama_kv_self_seq_add instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d),
-            "use llama_kv_self_seq_div instead");
-
-    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "use llama_kv_self_seq_pos_max instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
-            "use llama_kv_self_defrag instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
-            "use llama_kv_self_can_shift instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
-            "use llama_kv_self_update instead");
-
-
    //
    // State / sessions
    //
@@ -943,9 +838,12 @@ extern "C" {
    // Requires KV cache.
    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values do not mean a fatal error, but rather a warning.
-    //   0 - success
-    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error. the KV cache state is restored to the state before this call
+    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    //    0 - success
+    //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    //    2 - aborted
+    //   -1 - invalid input batch
+    // < -1 - error
    LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);
@@ -1 +1 @@
-9b048bb72b811f50b0c30d9e5c84d6ff9f4bf005
+7c06c10c532a6cda913c17fc56341e8880ae341d
@@ -93,6 +93,7 @@ llama_context::llama_context(
    }

    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
    cparams.op_offload = params.op_offload;

    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -176,8 +177,9 @@ llama_context::llama_context(
    // init the memory module
    if (!hparams.vocab_only) {
        llama_memory_params params_mem = {
-            /*.type_k =*/ params.type_k,
-            /*.type_v =*/ params.type_v,
+            /*.type_k   =*/ params.type_k,
+            /*.type_v   =*/ params.type_v,
+            /*.swa_full =*/ params.swa_full,
        };

        memory.reset(model.create_memory(params_mem, cparams));
@@ -947,8 +949,6 @@ int llama_context::decode(llama_batch & inp_batch) {

        // find KV slot
        if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
            return 1;
        }

@@ -2093,6 +2093,7 @@ llama_context_params llama_context_default_params() {
        /*.flash_attn                  =*/ false,
        /*.no_perf                     =*/ true,
        /*.op_offload                  =*/ true,
+        /*.swa_full                    =*/ true,
    };

    return result;
@@ -2287,39 +2288,10 @@ int32_t llama_apply_adapter_cvec(
    return res ? 0 : -1;
 }

-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
-    const auto * kv = ctx->get_kv_self();
-    if (kv == nullptr) {
-        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
-        return {};
-    }
-
-    return llama_kv_cache_view_init(*kv, n_seq_max);
-}
-
-void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
-    const auto * kv = ctx->get_kv_self();
-    if (kv == nullptr) {
-        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
-        return;
-    }
-
-    llama_kv_cache_view_update(view, kv);
-}
-
 //
 // kv cache
 //

-// deprecated
-int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
-    return llama_kv_self_n_tokens(ctx);
-}
-
 int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2329,11 +2301,6 @@ int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
    return kv->get_n_tokens();
 }

-// deprecated
-int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
-    return llama_kv_self_used_cells(ctx);
-}
-
 int32_t llama_kv_self_used_cells(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2343,11 +2310,6 @@ int32_t llama_kv_self_used_cells(const llama_context * ctx) {
    return kv->get_used_cells();
 }

-// deprecated
-void llama_kv_cache_clear(llama_context * ctx) {
-    llama_kv_self_clear(ctx);
-}
-
 void llama_kv_self_clear(llama_context * ctx) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2357,15 +2319,6 @@ void llama_kv_self_clear(llama_context * ctx) {
    kv->clear();
 }

-// deprecated
-bool llama_kv_cache_seq_rm(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1) {
-    return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
-}
-
 bool llama_kv_self_seq_rm(
        llama_context * ctx,
         llama_seq_id   seq_id,
@@ -2379,16 +2332,6 @@ bool llama_kv_self_seq_rm(
    return kv->seq_rm(seq_id, p0, p1);
 }

-// deprecated
-void llama_kv_cache_seq_cp(
-        llama_context * ctx,
-         llama_seq_id   seq_id_src,
-         llama_seq_id   seq_id_dst,
-            llama_pos   p0,
-            llama_pos   p1) {
-    llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
-}
-
 void llama_kv_self_seq_cp(
        llama_context * ctx,
         llama_seq_id   seq_id_src,
@@ -2403,13 +2346,6 @@ void llama_kv_self_seq_cp(
    kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
 }

-// deprecated
-void llama_kv_cache_seq_keep(
-        llama_context * ctx,
-         llama_seq_id   seq_id) {
-    llama_kv_self_seq_keep(ctx, seq_id);
-}
-
 void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2419,16 +2355,6 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
    kv->seq_keep(seq_id);
 }

-// deprecated
-void llama_kv_cache_seq_add(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-            llama_pos   delta) {
-    llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
-}
-
 void llama_kv_self_seq_add(
        llama_context * ctx,
         llama_seq_id   seq_id,
@@ -2443,16 +2369,6 @@ void llama_kv_self_seq_add(
    kv->seq_add(seq_id, p0, p1, delta);
 }

-// deprecated
-void llama_kv_cache_seq_div(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-                  int   d) {
-    llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
-}
-
 void llama_kv_self_seq_div(
        llama_context * ctx,
         llama_seq_id   seq_id,
@@ -2467,25 +2383,24 @@ void llama_kv_self_seq_div(
    kv->seq_div(seq_id, p0, p1, d);
 }

-// deprecated
-llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    return llama_kv_self_seq_pos_max(ctx, seq_id);
+llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return -1;
+    }
+
+    return kv->seq_pos_min(seq_id);
 }

 llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
-        return 0;
+        return -1;
    }

    return kv->seq_pos_max(seq_id);
 }

-// deprecated
-void llama_kv_cache_defrag(llama_context * ctx) {
-    llama_kv_self_defrag(ctx);
-}
-
 void llama_kv_self_defrag(llama_context * ctx) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2496,11 +2411,6 @@ void llama_kv_self_defrag(llama_context * ctx) {
    kv->defrag_sched(-1.0f);
 }

-// deprecated
-bool llama_kv_cache_can_shift(const llama_context * ctx) {
-    return llama_kv_self_can_shift(ctx);
-}
-
 bool llama_kv_self_can_shift(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2510,11 +2420,6 @@ bool llama_kv_self_can_shift(const llama_context * ctx) {
    return kv->get_can_shift();
 }

-// deprecated
-void llama_kv_cache_update(llama_context * ctx) {
-    llama_kv_self_update(ctx);
-}
-
 // llama state API

 // deprecated
@@ -2637,7 +2542,21 @@ int32_t llama_encode(
 int32_t llama_decode(
        llama_context * ctx,
          llama_batch   batch) {
-    const int ret = ctx->decode(batch);
+    int ret = ctx->decode(batch);
+
+    // defrag and try again
+    // TODO: distinguish return code when we are sure that even after defrag there is no space available
+    if (ret == 1) {
+        llama_kv_self_defrag(ctx);
+        ret = ctx->decode(batch);
+
+        if (ret == 1) {
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+            return ret;
+        }
+    }
+
    if (ret != 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }
@@ -9,33 +9,6 @@
 #include <cmath>
 #include <cstring>

-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
-    // TODO move to hparams if a T5 variant appears that uses a different value
-    const int64_t max_distance = 128;
-
-    if (bidirectional) {
-        n_buckets >>= 1;
-    }
-
-    const int64_t max_exact = n_buckets >> 1;
-
-    int32_t relative_position = x - y;
-    int32_t relative_bucket = 0;
-
-    if (bidirectional) {
-        relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
-    } else {
-        relative_position = -std::min<int32_t>(relative_position, 0);
-    }
-
-    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
-    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
-    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
-    return relative_bucket;
-}
-
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    if (ubatch->token) {
        const int64_t n_tokens = ubatch->n_tokens;
@@ -110,22 +83,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {

 void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
    if (pos_bucket) {
-        const int64_t n_tokens = ubatch->n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
-        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
-        int32_t * data = (int32_t *) pos_bucket->data;
-
-        const int64_t n_kv = kv_self->n;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_kv; ++i) {
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
-                }
-            }
-        }
+        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
    }
 }

@@ -403,99 +361,18 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
-    if (self_kq_mask || self_kq_mask_swa) {
-        const int64_t n_kv         = kv_self->n;
-        const int64_t n_tokens     = ubatch->n_tokens;
-        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs       = ubatch->n_seqs;
+    if (self_kq_mask) {
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
+}

-        float * data     = nullptr;
-        float * data_swa = nullptr;
+void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+    if (self_kq_mask) {
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }

-        if (self_kq_mask) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-            data = (float *) self_kq_mask->data;
-        }
-
-        if (self_kq_mask_swa) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-            data_swa = (float *) self_kq_mask_swa->data;
-        }
-
-        // Use only the previous KV cells of the correct sequence for each token of the ubatch.
-        // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
-        // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
-        //   Causal mask:
-        //      xxx-------
-        //      xxxx------
-        //      xxxxx-----
-        //   Non-causal mask:
-        //      xxxxx-----
-        //      xxxxx-----
-        //      xxxxx-----
-        // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-        for (int h = 0; h < 1; ++h) {
-            for (int s = 0; s < n_seqs; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-                for (int j = 0; j < n_seq_tokens; ++j) {
-                    const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        // mask the token if:
-                        if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
-                            || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
-                        ) {
-                            f = -INFINITY;
-                        } else {
-                            if (hparams.use_alibi) {
-                                f = -std::abs(kv_self->cells[i].pos - pos);
-                            } else {
-                                f = 0.0f;
-                            }
-                        }
-
-                        if (data) {
-                            data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-
-                        // may need to cut off old tokens for sliding window
-                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                        if (data_swa) {
-                            if (hparams.n_attn_chunk) {
-                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                                    f = -INFINITY;
-                                }
-                            } else {
-                                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                    f = -INFINITY;
-                                }
-                            }
-                            data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-                    }
-                }
-            }
-
-            // mask padded tokens
-            if (data) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-
-            // mask padded tokens
-            if (data_swa) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-        }
+    if (self_kq_mask_swa) {
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
    }
 }

@@ -545,7 +422,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    n_layer          (hparams.n_layer),
    n_rot            (hparams.n_rot),
    n_ctx            (cparams.n_ctx),
-    n_ctx_per_seq    (cparams.n_ctx / cparams.n_seq_max),
    n_head           (hparams.n_head()),
    n_head_kv        (hparams.n_head_kv()),
    n_embd_head_k    (hparams.n_embd_head_k),
@@ -1153,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {

    auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);

-    const auto n_kv = kv_self->n;
+    const auto n_kv = kv_self->get_n();

    auto & cur = inp->pos_bucket;

@@ -1188,16 +1064,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
         ggml_tensor * v_mla,
-             bool      v_trans,
             float     kq_scale) const {
-  //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-  //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+    const bool v_trans = v->nb[1] > v->nb[2];

-  //const int64_t n_head    = hparams.n_head(il);
-  //const int64_t n_head_kv = hparams.n_head_kv(il);
-
-  //const auto & n_embd_head_k = hparams.n_embd_head_k;
-  //const auto & n_embd_head_v = hparams.n_embd_head_v;
+    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+    v = ggml_permute(ctx0, v, 0, 2, 1, 3);

    const auto n_tokens = q->ne[1];
    const auto n_head   = q->ne[2];
@@ -1336,17 +1208,11 @@ ggml_tensor * llm_graph_context::build_attn(

    const auto & kq_mask = inp->get_kq_mask();

-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;

+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
@@ -1369,22 +1235,17 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()

    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);

-    const auto n_kv = kv_self->n;
+    {
+        GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA");
+        GGML_ASSERT(hparams.n_swa == 0         && "Use llama_kv_cache_unified_iswa for SWA");

-    inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-    //cb(inp->self_kq_mask, "KQ_mask", -1);
-    ggml_set_input(inp->self_kq_mask);
+        const auto n_kv = kv_self->get_n();

-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);

-    if (hparams.n_swa_pattern > 1) {
-        GGML_ASSERT(hparams.n_swa > 0);
-
-        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
-        ggml_set_input(inp->self_kq_mask_swa);
-
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
    }

    return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
@@ -1409,87 +1270,110 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_build_forward_expand(gf, v_cur);

    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
-    const auto & n_ctx = cparams.n_ctx;
-
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-    const auto n_tokens = q_cur->ne[2];
-
-    const bool v_trans = !cparams.flash_attn;

    // store to KV cache
    {
-        const auto kv_head = kv_self->head;
-
-        GGML_ASSERT(kv_self->size == n_ctx);
-
-        ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
-        //cb(k_cache_view, "k_cache_view", il);
-
-        // note: storing RoPE-ed version of K in the KV cache
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
-
-        v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
-
-        ggml_tensor * v_cache_view = nullptr;
-
-        if (!v_trans) {
-            v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
-        } else {
-            // note: the V cache is transposed when not using flash attention
-            v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
-                    (  n_ctx)*ggml_element_size(kv_self->v_l[il]),
-                    (kv_head)*ggml_element_size(kv_self->v_l[il]));
-
-            v_cur = ggml_transpose(ctx0, v_cur);
-        }
-        //cb(v_cache_view, "v_cache_view", il);
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
+        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
    }

-    const bool is_swa = hparams.is_swa(il);
+    const auto & kq_mask = inp->get_kq_mask();

-    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv_self->get_k(ctx0, il);
+    ggml_tensor * v = kv_self->get_v(ctx0, il);

-    const auto n_kv = kv_self->n;
-
-    const int64_t n_head_kv = hparams.n_head_kv(il);
-
-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-    const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k =
-        ggml_view_3d(ctx0, kv_self->k_l[il],
-                n_embd_head_k, n_kv, n_head_kv,
-                ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
-                0);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = !v_trans ?
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-                n_embd_head_v, n_kv, n_head_kv,
-                ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-                ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
-                0) :
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-                n_kv, n_embd_head_v, n_head_kv,
-                ggml_element_size(kv_self->v_l[il])*n_ctx,
-                ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
-                0);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
        cur = build_lora_mm(wo, cur);
    }

+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+
+    return cur;
+}
+
+llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
+
+    {
+        const auto n_kv = kv_self->get_kv_base()->get_n();
+
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);
+
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    }
+
+    if (hparams.n_swa_pattern > 1) {
+        GGML_ASSERT(hparams.n_swa > 0          && "Use llama_kv_cache_unified for non-SWA");
+
+        const auto n_kv = kv_self->get_kv_swa()->get_n();
+
+        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    }
+
+    return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_cgraph * gf,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+            float     kq_scale,
+            int       il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    ggml_build_forward_expand(gf, v_cur);
+
+    const bool is_swa = hparams.is_swa(il);
+
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+    }
+
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv->get_k(ctx0, il);
+    ggml_tensor * v = kv->get_v(ctx0, il);
+
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    cb(cur, "kqv_out", il);
+
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
+    }
+
    if (wo_b) {
        //cb(cur, "kqv_wo", il);
    }
@@ -1534,17 +1418,11 @@ ggml_tensor * llm_graph_context::build_attn(

    const auto & kq_mask = inp->get_kq_mask_cross();

-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;

+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
@@ -1712,3 +1590,30 @@ void llm_graph_context::build_pooling(

    ggml_build_forward_expand(gf, cur);
 }
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+    return relative_bucket;
+}
@@ -19,6 +19,7 @@ struct llama_cparams;

 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_unified_iswa;
 class llama_kv_cache_recurrent;

 // certain models (typically multi-modal) can produce different types of graphs
@@ -255,6 +256,31 @@ public:

    void set_input(const llama_ubatch * ubatch) override;

+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_kv_cache_unified * kv_self;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_unified_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified_iswa * kv_self) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_self(kv_self) {
+    }
+    ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

@@ -266,7 +292,7 @@ public:
    const llama_hparams & hparams;
    const llama_cparams & cparams;

-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_iswa * kv_self;
 };

 class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -378,7 +404,6 @@ struct llm_graph_context {
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
@@ -507,13 +532,12 @@ struct llm_graph_context {

    ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
-             ggml_tensor * q,     // [n_embd_head_q, n_tokens, n_head_q]
-             ggml_tensor * k,     // [n_embd_head_k, n_tokens, n_head_k]
-             ggml_tensor * v,     // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+             ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
+             ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
+             ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
-             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-                    bool   v_trans,
+             ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale) const;

    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -546,6 +570,21 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il) const;
+
    llm_graph_input_attn_cross * build_attn_inp_cross() const;

    ggml_tensor * build_attn(
@@ -596,3 +635,6 @@ struct llm_graph_context {
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
 };
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
@@ -14,6 +14,12 @@ enum llama_expert_gating_func_type {
    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
 };

+enum llama_swa_type {
+    LLAMA_SWA_TYPE_NONE     = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED  = 2,
+};
+
 struct llama_hparams_posnet {
    uint32_t n_embd;
    uint32_t n_layer;
@@ -35,8 +41,6 @@ struct llama_hparams {
    uint32_t n_embd_features = 0;
    uint32_t n_layer;
    uint32_t n_rot;
-    uint32_t n_swa = 0; // sliding window attention (SWA)
-    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert = 0;
@@ -96,6 +100,12 @@ struct llama_hparams {

    std::array<int, 4> rope_sections;

+    // Sliding Window Attention (SWA)
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+    uint32_t n_swa = 0;         // the size of the sliding window (0 - no SWA)
+    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
+
    // for State Space Models
    uint32_t ssm_d_conv  = 0;
    uint32_t ssm_d_inner = 0;
@@ -116,11 +126,10 @@ struct llama_hparams {
    bool causal_attn   = true;
    bool use_alibi     = false;
    bool attn_soft_cap = false;
+    bool use_kq_norm   = true;

+    // llama4
    uint32_t n_moe_layer_step        = 0;
-    bool     use_kq_norm             = true;
-    uint32_t n_attn_chunk            = 0;
-    // values below seems to be fixed on llama4
    uint32_t n_no_rope_layer_step    = 4;
    uint32_t n_attn_temp_floor_scale = 8192;
    float    f_attn_temp_scale       = 0.1;
@@ -8,6 +8,7 @@
 #include "ggml-cpp.h"

 #include <set>
+#include <unordered_map>
 #include <vector>

 struct llama_cparams;
@@ -40,6 +41,9 @@ struct llama_kv_cache : public llama_memory_i {
    // batch processing
    //

+    // =============================================================================================================
+    // TODO: refactor and simplify this
+
    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;

    // different KV caches require different batch splitting strategies
@@ -48,6 +52,8 @@ struct llama_kv_cache : public llama_memory_i {
    // find an empty slot of size "n_tokens" in the cache
    virtual bool find_slot(const llama_ubatch & batch) = 0;

+    // =============================================================================================================
+
    // getters
    virtual int32_t   get_n_tokens()   const = 0;
    virtual int32_t   get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
@@ -87,38 +93,24 @@ private:
 // llama_kv_cache_unified
 //

-// TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    struct kv_cell {
-        llama_pos pos   = -1;
-        llama_pos delta =  0;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
-    };
-
    static uint32_t get_padding(const llama_cparams & cparams);

+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
    llama_kv_cache_unified(
-            const llama_model & model,
-                    ggml_type   type_k,
-                    ggml_type   type_v,
-                         bool   v_trans,
-                         bool   offload,
-                     uint32_t   kv_size,
-                     uint32_t   padding);
+            const llama_model &  model,
+              layer_filter_cb && filter,
+                    ggml_type    type_k,
+                    ggml_type    type_v,
+                         bool    v_trans,
+                         bool    offload,
+                     uint32_t    kv_size,
+                     uint32_t    padding,
+                     uint32_t    n_swa,
+               llama_swa_type    swa_type);

    ~llama_kv_cache_unified() = default;

@@ -130,10 +122,11 @@ public:

    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;

+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

    //
@@ -150,7 +143,6 @@ public:
    void set_full() override;

    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;

    // updates the cache head
@@ -169,29 +161,72 @@ public:
    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;

-    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
-    uint32_t size = 0; // total number of cells, shared across all sequences
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+    //
+    // llama_kv_cache_unified specific API
+    //

-    // computed before each graph build
-    uint32_t n = 0;
+    uint32_t get_n() const;
+    uint32_t get_size() const;

-    std::vector<kv_cell> cells;
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

-    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<ggml_tensor *> v_l;
+    // store k_cur and v_cur in the cache based on the current head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
+
+    void prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax);
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+    void set_input_k_shift   (ggml_tensor * dst) const;
+    void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;

 private:
    const llama_model & model;
    const llama_hparams & hparams;

+    struct kv_cell {
+        llama_pos pos   = -1;
+        llama_pos delta =  0;
+
+        // TODO: replace with bitset uint64_t
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    struct kv_layer {
+        // layer index in the model
+        // note: can be different from the layer index in the KV cache
+        uint32_t il;
+
+        ggml_tensor * k;
+        ggml_tensor * v;
+    };
+
    bool has_shift = false;
    bool do_defrag = false;
-
    bool v_trans   = true;  // the value tensor is transposed
-    bool can_shift = false;
+
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
+    uint32_t used = 0; // used cells (i.e. at least one seq_id) (TODO: add `struct kv_cells` and keep track automatically)
+
+    // computed before each graph build
+    uint32_t n = 0;

    // required padding
    uint32_t padding = 1;
@@ -199,9 +234,29 @@ private:
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

+    // SWA
+    uint32_t n_swa = 0;
+
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

+    std::vector<kv_cell>  cells;  // TODO: replace with `struct kv_cells`
+    std::vector<kv_layer> layers;
+
+    // model layer id -> KV cache layer id
+    std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+    // recovery information used to restore the KV cells to their original state in case of a failure
+    struct {
+        void clear() {
+            cells.clear();
+        }
+
+        std::unordered_map<uint32_t, kv_cell> cells;
+    } recovery;
+
    // defrag
    struct {
        std::vector<uint32_t> ids;
@@ -210,17 +265,6 @@ private:
    // return true if cells have been moved
    bool defrag_prepare(int32_t n_max_nodes);

-    // commit/restore cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
-
    // find how many cells are currently in use
    uint32_t cell_max() const;

@@ -229,6 +273,8 @@ private:
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;

+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
    ggml_tensor * build_rope_shift(
            const llama_cparams & cparams,
                   ggml_context * ctx,
@@ -255,6 +301,106 @@ private:
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };

+//
+// llama_kv_cache_unified_iswa
+//
+
+// utilizes two instances of llama_kv_cache_unified
+//   the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
+//   upon successful commit, the SWA cache removes old tokens outside the n_swa window
+
+class llama_kv_cache_unified_iswa : public llama_kv_cache {
+public:
+    llama_kv_cache_unified_iswa(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                     uint32_t   kv_size,
+                         bool   swa_full,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_batch,
+                     uint32_t   padding);
+
+    ~llama_kv_cache_unified_iswa() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    void clear() override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    //
+    // llama_kv_cache
+    //
+
+    void restore() override;
+    void commit()  override;
+
+    bool update(llama_context & ctx) override;
+
+    void defrag_sched(float thold) override;
+
+    void set_full() override;
+
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+    bool find_slot(const llama_ubatch & batch) override;
+
+    int32_t get_n_tokens()   const override;
+    int32_t get_used_cells() const override;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
+
+    bool get_can_shift() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+
+    //
+    // llama_kv_cache_unified_iswa specific API
+    //
+
+    llama_kv_cache_unified * get_kv_base() const;
+    llama_kv_cache_unified * get_kv_swa () const;
+
+private:
+    const llama_hparams & hparams;
+
+    bool do_prune = true;
+
+    struct {
+        struct entry {
+            llama_pos pmin;
+            llama_pos pmax;
+        };
+
+        void clear() {
+            pos.clear();
+        }
+
+        // used to perform SWA pruning of old tokens
+        std::unordered_map<llama_seq_id, entry> pos;
+    } pending;
+
+    std::unique_ptr<llama_kv_cache_unified> kv_base;
+    std::unique_ptr<llama_kv_cache_unified> kv_swa;
+};
+
 //
 // llama_kv_cache_recurrent
 //
@@ -302,6 +448,7 @@ public:
    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;

+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

    //
@@ -318,7 +465,6 @@ public:
    void set_full() override;

    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;

    bool find_slot(const llama_ubatch & batch) override;
@@ -388,12 +534,3 @@ private:
    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };
-
-
-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
-
-void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
@@ -7,8 +7,8 @@ struct llama_memory_params {
    ggml_type type_k;
    ggml_type type_v;

-    // parameters for other types of memory
-    // ...
+    // use full-size SWA cache
+    bool swa_full;
 };

 // general concept of LLM memory
@@ -25,6 +25,7 @@ public:
    virtual void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) = 0;
    virtual void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) = 0;

+    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

    virtual bool get_can_edit() const = 0;
@@ -571,9 +571,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
+
+                hparams.swa_type      = LLAMA_SWA_TYPE_CHUNKED;
+                hparams.n_swa         = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
                hparams.n_swa_pattern = 4;    // pattern: 3 chunked - 1 full
-                hparams.n_attn_chunk  = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
-                hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later

                switch (hparams.n_expert) {
                    case 16:  type = LLM_TYPE_17B_16E; break;
@@ -855,20 +856,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
                    hparams.n_swa = 2047;
                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-mini-128k-instruct
-                    // note: this seems incorrect because the window is bigger than the train context?
-                    hparams.n_swa = 262144;
+                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa         = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-medium-128k-instruct
-                    // note: this seems incorrect because the window is equal to the train context?
-                    hparams.n_swa = 131072;
+                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa         = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
                }
+
                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (!found_swa && hparams.n_swa == 0) {
                    throw std::runtime_error("invalid value for sliding_window");
                }
+
+                if (hparams.n_swa > hparams.n_ctx_train) { // window exceeds the training context: treat the model as non-SWA
+                    LLAMA_LOG_WARN("%s: unexpected n_swa: %u > %u, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa         = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
+                }
            } break;
        case LLM_ARCH_PHIMOE:
            {
@@ -937,6 +960,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_GEMMA2:
            {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.n_swa = 4096; // default value of gemma 2
                hparams.n_swa_pattern = 2;
                hparams.attn_soft_cap = true;
@@ -955,6 +979,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_GEMMA3:
            {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.n_swa_pattern = 6;

                hparams.rope_freq_base_train_swa  = 10000.0f;
@@ -1039,6 +1064,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_COHERE2:
            {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.n_swa_pattern = 4;

                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
@@ -4489,7 +4515,17 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
    return it->second;
 }

-ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; // SWA layers take the SWA training value from hparams, other layers the context's value
+}
+
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; // SWA layers take the SWA training value from hparams, other layers the context's value
+}
+
+ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
    // choose long/short freq factors based on the context size
    if (layers[il].rope_freqs != nullptr) {
        return layers[il].rope_freqs;
@@ -4517,22 +4553,13 @@ struct llm_build_llama : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-        // temperature tuning
-        ggml_tensor * inp_attn_scale = nullptr;
-        if (arch == LLM_ARCH_LLAMA4) {
-            inp_attn_scale = build_inp_attn_scale();
-        }
-
        auto * inp_attn = build_attn_inp_kv_unified();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

-            bool use_rope = arch == LLM_ARCH_LLAMA4
-                ? (il + 1) % hparams.n_no_rope_layer_step != 0
-                : true;
-
            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
@@ -4542,7 +4569,169 @@ struct llm_build_llama : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(cur, "ffn_moe_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // temperature tuning
+        ggml_tensor * inp_attn_scale = nullptr;
+        inp_attn_scale = build_inp_attn_scale();
+
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4590,7 +4779,7 @@ struct llm_build_llama : public llm_graph_context {
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

-                if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+                if (use_rope && hparams.use_kq_norm) {
                    // Llama4TextL2Norm
                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4616,7 +4805,6 @@ struct llm_build_llama : public llm_graph_context {

            // feed-forward network (non-MoE)
            if (model.layers[il].ffn_gate_inp == nullptr) {
-
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
@@ -4629,9 +4817,7 @@ struct llm_build_llama : public llm_graph_context {
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
-
-            } else if (arch == LLM_ARCH_LLAMA4) {
-                // llama4 MoE
+            } else {
                ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
@@ -4660,26 +4846,6 @@ struct llm_build_llama : public llm_graph_context {

                cur = ggml_add(ctx0, moe_out, shexp_out);
                cb(cur, "ffn_moe_out_merged", il);
-
-            } else {
-                // MoE branch
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = build_moe_ffn(cur,
-                        model.layers[il].ffn_gate_inp,
-                        model.layers[il].ffn_up_exps,
-                        model.layers[il].ffn_gate_exps,
-                        model.layers[il].ffn_down_exps,
-                        nullptr,
-                        n_expert, n_expert_used,
-                        LLM_FFN_SILU, true,
-                        false, 0.0,
-                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                        il);
-                cb(cur, "ffn_moe_out", il);
            }

            cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4753,7 +4919,7 @@ struct llm_build_deci : public llm_graph_context {
            } else if (n_head > 0) {
                // self-attention
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7202,8 +7368,8 @@ struct llm_build_phi2 : public llm_graph_context {
    }
 };

-struct llm_build_phi3 : public llm_graph_context {
-    llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_phi3_iswa : public llm_graph_context {
+    llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

@@ -7217,7 +7383,7 @@ struct llm_build_phi3 : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

        for (int il = 0; il < n_layer; ++il) {
            auto * residual = inpL;
@@ -7225,7 +7391,7 @@ struct llm_build_phi3 : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                ggml_tensor* attn_norm_output = build_norm(inpL,
                        model.layers[il].attn_norm,
@@ -7977,7 +8143,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

-            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

            // norm
            cur = build_norm(inpL,
@@ -8277,8 +8443,8 @@ struct llm_build_gemma : public llm_graph_context {
    }
 };

-struct llm_build_gemma2 : public llm_graph_context {
-    llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_gemma2_iswa : public llm_graph_context {
+    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_k;

        ggml_tensor * cur;
@@ -8292,7 +8458,7 @@ struct llm_build_gemma2 : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

        for (int il = 0; il < n_layer; ++il) {
            // norm
@@ -8414,8 +8580,8 @@ struct llm_build_gemma2 : public llm_graph_context {
    }
 };

-struct llm_build_gemma3 : public llm_graph_context {
-    llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_gemma3_iswa : public llm_graph_context {
+    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_k;

        ggml_tensor * cur;
@@ -8433,13 +8599,11 @@ struct llm_build_gemma3 : public llm_graph_context {
        ggml_tensor * inp_pos = build_inp_pos();

        // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

        for (int il = 0; il < n_layer; ++il) {
-            const bool is_swa = hparams.is_swa(il);
-
-            const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
-            const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

            // norm
            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -9016,8 +9180,8 @@ struct llm_build_command_r : public llm_graph_context {
    }
 };

-struct llm_build_cohere2 : public llm_graph_context {
-    llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_cohere2_iswa : public llm_graph_context {
+    llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9032,7 +9196,7 @@ struct llm_build_cohere2 : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

        for (int il = 0; il < n_layer; ++il) {
            const bool is_swa = hparams.is_swa(il);
@@ -9045,7 +9209,7 @@ struct llm_build_cohere2 : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9983,7 +10147,7 @@ struct llm_build_deepseek : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11347,7 +11511,7 @@ struct llm_build_exaone : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12263,7 +12427,7 @@ struct llm_build_granite : public llm_graph_context {
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                if (use_rope) {
-                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
                    Qcur = ggml_rope_ext(
                            ctx0, Qcur, inp_pos, rope_factors,
                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -12916,7 +13080,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13068,14 +13232,31 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

-                res = new llama_kv_cache_unified(
-                        *this,
-                        params.type_k,
-                        params.type_v,
-                        !cparams.flash_attn,
-                        cparams.offload_kqv,
-                        cparams.n_ctx,
-                        padding);
+                if (hparams.n_swa > 0) {
+                    res = new llama_kv_cache_unified_iswa(
+                            *this,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            params.swa_full,
+                            cparams.n_seq_max,
+                            cparams.n_batch,
+                            padding);
+                } else {
+                    res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                }
            }
    }

@@ -13090,11 +13271,14 @@ llm_graph_result_ptr llama_model::build_graph(

    switch (arch) {
        case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13169,7 +13353,7 @@ llm_graph_result_ptr llama_model::build_graph(
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
-                llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+                llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_PLAMO:
            {
@@ -13201,11 +13385,11 @@ llm_graph_result_ptr llama_model::build_graph(
            } break;
        case LLM_ARCH_GEMMA2:
            {
-                llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_GEMMA3:
            {
-                llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_STARCODER2:
            {
@@ -13225,7 +13409,7 @@ llm_graph_result_ptr llama_model::build_graph(
            } break;
        case LLM_ARCH_COHERE2:
            {
-                llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_DBRX:
            {
@@ -398,7 +398,10 @@ struct llama_model {

    const struct ggml_tensor * get_tensor(const char * name) const;

-    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
@@ -128,7 +128,7 @@ int main(void) {

    if (common_has_curl()) {
        printf("test-arg-parser: test curl-related functions\n\n");
-        const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md";
+        const char * GOOD_URL = "https://ggml.ai/";
        const char * BAD_URL  = "https://www.google.com/404";
        const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";

@@ -80,10 +80,6 @@ Using the `-d <n>` option, each test can be run at a specified context depth, pr

 For a description of the other options, see the [main example](../main/README.md).

-Note:
-
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.
-
 ## Examples

 ### Text generation with different models
@@ -991,6 +991,7 @@ struct cmd_params_instance {
        cparams.flash_attn   = flash_attn;
        cparams.embeddings   = embeddings;
        cparams.op_offload   = !no_op_offload;
+        cparams.swa_full     = false;

        return cparams;
    }
@@ -4,6 +4,7 @@

 #include <climits>
 #include <cstdarg>
+#include <cinttypes>
 #include <string>
 #include <map>
 #include <sstream>
@@ -44,7 +45,7 @@
 // tensor name constants
 //

-#define TN_POS_EMBD        "%s.position_embd.weight"
+#define TN_POS_EMBD        "v.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backward compat
 #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
@@ -110,6 +111,7 @@ enum projector_type {
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
    PROJECTOR_TYPE_INTERNVL,
+    PROJECTOR_TYPE_LLAMA4,
    PROJECTOR_TYPE_UNKNOWN,
 };

@@ -125,6 +127,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
+    { PROJECTOR_TYPE_LLAMA4,    "llama4"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -240,6 +243,11 @@ struct clip_image_u8_batch {
 struct clip_image_f32_batch {
    std::vector<clip_image_f32_ptr> entries;

+    // for llava-uhd style models, we need to know the grid size
+    // note: entries.size() == grid_x * grid_y + 1 (one overview image)
+    int grid_x = 0;
+    int grid_y = 0;
+
    clip_image_f32_batch clone() const {
        clip_image_f32_batch new_batch;
        new_batch.entries.reserve(entries.size());
@@ -358,6 +366,70 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
    }
 }

+//
+// debugging
+//
+
+static void print_tensor_shape(ggml_tensor * t) { // debug helper: prints "<name>.shape = [ne0, ne1, ...]" to stdout
+    printf("%s.shape = [", t->name);
+    for (int i = 0; i < ggml_n_dims(t); ++i) { // one entry per dimension reported by ggml_n_dims(t)
+        printf("%" PRId64, t->ne[i]); // PRId64 comes from <cinttypes>
+        if (i < ggml_n_dims(t) - 1) { // comma between entries, none after the last one
+            printf(", ");
+        }
+    }
+    printf("]\n");
+}
+
+static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { // debug helper: prints the values of `t` read from the raw byte buffer `data`; dims 0..2 show only the first and last `n` entries when longer than 2*n
+    ggml_type type = t->type;
+    int64_t * ne = t->ne; // extents per dimension (loop bounds below)
+    size_t * nb = t->nb; // per-dimension multipliers used to compute the offset into `data`
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) { // outermost dimension is never elided
+        printf("%s.data: [\n", t->name);
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) { // after the first n entries, print "..." and jump to the last n
+                printf("     ..., \n");
+                i2 = ne[2] - n;
+            }
+            printf("     [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) { // same elision rule for dimension 1
+                    printf("      ..., \n");
+                    i1 = ne[1] - n;
+                }
+                printf("      [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) { // same elision rule for dimension 0
+                        printf("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; // offset of element (i0, i1, i2, i3) inside `data`
+                    float v; // every supported element type is converted to float for printing
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) &data[i];
+                    } else {
+                        GGML_ABORT("fatal error"); // any other tensor type is not handled by this printer
+                    }
+                    printf("%8.4f", v);
+                    if (i0 < ne[0] - 1) printf(", "); // comma between values, none after the last one
+                }
+                printf("],\n");
+            }
+            printf("     ],\n");
+        }
+        printf("    ]\n");
+    }
+}
+
 //
 // API used internally with mtmd
 //
@@ -359,9 +359,12 @@ struct clip_ctx {
    int max_nodes = 8192;
    ggml_backend_sched_ptr sched;

-    clip_image_size load_image_size;
+    // for debugging
+    bool debug_graph = false;
+    std::vector<ggml_tensor *> debug_print_tensors;

    clip_ctx(clip_context_params & ctx_params) {
+        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (!backend_cpu) {
            throw std::runtime_error("failed to initialize CPU backend");
@@ -440,7 +443,7 @@ struct clip_graph {
        };
        ctx0_ptr.reset(ggml_init(params));
        ctx0 = ctx0_ptr.get();
-        gf = ggml_new_graph(ctx0);
+        gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
    }

    ggml_cgraph * build_siglip() {
@@ -522,7 +525,7 @@ struct clip_graph {
        ggml_set_input(pos_w);

        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
+            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
        };

        ggml_tensor * inp = build_inp();
@@ -936,6 +939,101 @@ struct clip_graph {
        return gf;
    }

+    ggml_cgraph * build_llama4() { // builds the vision graph selected for PROJECTOR_TYPE_LLAMA4; returns gf with the projected embeddings expanded last
+        GGML_ASSERT(model.class_embedding != nullptr); // concatenated as the [CLS] token below
+        GGML_ASSERT(model.position_embeddings != nullptr); // passed to build_vit below
+
+        const int n_pos = n_patches + 1; // +1 for [CLS]
+
+        // 2D input positions (graph inputs named "pos_h" / "pos_w", one I32 entry per position incl. [CLS])
+        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_h, "pos_h");
+        ggml_set_input(pos_h);
+
+        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_w, "pos_w");
+        ggml_set_input(pos_w);
+
+        ggml_tensor * inp = build_inp_raw();
+
+        // Llama4UnfoldConvolution: im2col over the raw input, then a matmul with the patch embedding weights
+        {
+            ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+                                                    patch_size, patch_size, 3, n_embd); // reshaped view handed to ggml_im2col as the kernel argument
+            inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
+            inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+            inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+            cb(inp, "patch_conv", -1);
+        }
+
+        // add CLS token (concatenated along dim 1, after the patch embeddings)
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+        // build ViT with 2D position embeddings
+        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+            // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
+            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); // pos_w is applied to the first half, pos_h to the second; interleave_freq = false
+        };
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                NORM_TYPE_NORMAL,
+                                hparams.ffn_op,
+                                model.position_embeddings,
+                                add_pos);
+
+        // remove CLS token (view of n_patches rows starting at offset 0, so the trailing [CLS] row is dropped)
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_patches,
+            ggml_row_size(cur->type, n_embd), 0);
+
+        // pixel shuffle
+        // based on Llama4VisionPixelShuffleMLP
+        // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
+        {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            GGML_ASSERT(scale_factor > 0); // used as a divisor below
+            GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
+            cur = ggml_reshape_4d(ctx0, cur,
+                n_embd * scale_factor,
+                n_patches_x / scale_factor,
+                n_patches_y,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                n_patches_x / scale_factor,
+                n_patches_y / scale_factor,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            // flatten to 2D: n_patches / scale_factor^2 rows of n_embd * scale_factor^2 values
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                n_patches / scale_factor / scale_factor);
+            cb(cur, "pixel_shuffle", -1);
+        }
+
+        // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
+        {
+            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+            cur = ggml_gelu(ctx0, cur);
+            cb(cur, "adapter_mlp", -1);
+        }
+
+        // Llama4MultiModalProjector
+        cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+        cb(cur, "projected", -1);
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
    // this graph is used by llava, granite and glm
    // due to having embedding_stack (used by granite), we cannot reuse build_vit
    ggml_cgraph * build_llava() {
@@ -1315,11 +1413,15 @@ private:
    // utility functions
    //

-    void cb(ggml_tensor * cur, const char * name, int il) const {
-        // TODO: implement this
-        GGML_UNUSED(cur);
-        GGML_UNUSED(name);
-        GGML_UNUSED(il);
+    void cb(ggml_tensor * cur0, const char * name, int il) const { // debug callback: no-op unless ctx->debug_graph is set; otherwise records a named copy of cur0 for later printing
+        if (ctx->debug_graph) {
+            ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); // copy cur0 into a separate tensor so the copy itself can be named and marked as an output
+            std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; // "_<il>" suffix only when il >= 0
+            ggml_set_name(cur, cur_name.c_str());
+            ggml_set_output(cur);
+            ggml_build_forward_expand(gf, cur); // add the copy to the graph being built
+            ctx->debug_print_tensors.push_back(cur); // remembered on the context so it can be dumped when debug_graph is set
+        }
+    }

    // build vision transformer (ViT) cgraph
@@ -1630,9 +1732,10 @@ private:
    static ggml_tensor * build_rope_2d(
        ggml_context * ctx0,
        ggml_tensor * cur,
-        ggml_tensor * pos_h,
-        ggml_tensor * pos_w,
-        const float freq_base
+        ggml_tensor * pos_a, // first half
+        ggml_tensor * pos_b, // second half
+        const float freq_base,
+        const bool interleave_freq
    ) {
        const int64_t n_dim  = cur->ne[0];
        const int64_t n_head = cur->ne[1];
@@ -1646,7 +1749,9 @@ private:
        //  ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
        // then for the second half, we use freq_scale to shift the inv_freq
        //  ^ why? replace (2i) with (2i+1) in the above equation
-        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
+        const float freq_scale_odd = interleave_freq
+                                    ? std::pow(freq_base, (float)-2/n_dim)
+                                    : 1.0;

        // first half
        ggml_tensor * first;
@@ -1659,7 +1764,7 @@ private:
            first = ggml_rope_ext(
                ctx0,
                first,
-                pos_h,      // positions
+                pos_a,      // positions
                nullptr,    // freq factors
                n_dim/2,    // n_dims
                0, 0, freq_base,
@@ -1679,7 +1784,7 @@ private:
            second = ggml_rope_ext(
                ctx0,
                second,
-                pos_w,      // positions
+                pos_b,      // positions
                nullptr,    // freq factors
                n_dim/2,    // n_dims
                0, 0, freq_base,
@@ -1723,6 +1828,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                res = graph.build_internvl();
            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                res = graph.build_llama4();
+            } break;
        default:
            {
                res = graph.build_llava();
@@ -1926,6 +2035,21 @@ struct clip_model_loader {
                        hparams.warmup_image_size = hparams.patch_size * 8;
                        get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                    } break;
+                case PROJECTOR_TYPE_LLAMA4:
+                    {
+                        hparams.rope_theta = 10000.0f;
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
+
+                        // borrowed from llava-1.6
+                        const int isize = hparams.image_size;
+                        hparams.image_grid_pinpoints = {
+                            isize,   isize*2, // 336, 672
+                            isize*2, isize,   // 672, 336
+                            isize*2, isize*2, // 672, 672
+                            isize*3, isize,   // 1008, 336
+                            isize,   isize*3, // 336, 1008
+                        };
+                    } break;
                default:
                    break;
            }
@@ -1946,6 +2070,10 @@ struct clip_model_loader {
            LOG_INF("%s: ffn_op:             %s\n", __func__, log_ffn_op.c_str());
            LOG_INF("%s: model size:         %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
            LOG_INF("%s: metadata size:      %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
+
+            if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {
+                LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+            }
        }
    }

@@ -2001,7 +2129,7 @@ struct clip_model_loader {
        vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD,   false);
        vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);

-        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
+        vision_model.position_embeddings = get_tensor(TN_POS_EMBD, false);

        // layers
        vision_model.layers.resize(hparams.n_layer);
@@ -2182,6 +2310,12 @@ struct clip_model_loader {
                    vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                    vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                } break;
+            case PROJECTOR_TYPE_LLAMA4:
+                {
+                    vision_model.mm_model_proj    = get_tensor(TN_MM_PROJECTOR);
+                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
+                } break;
            default:
                GGML_ASSERT(false && "unknown projector type");
        }
@@ -2328,14 +2462,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
    return ctx_clip;
 }

-void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = *load_image_size; // copy
-}
-
-struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return &ctx_clip->load_image_size;
-}
-
 struct clip_image_size * clip_image_size_init() {
    struct clip_image_size * load_image_size = new struct clip_image_size();
    load_image_size->width = 448;
@@ -2849,7 +2975,7 @@ private:

    // used by llava 1.6 with custom list of pinpoints
    static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions;
+        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
        for (size_t i = 0; i < pinpoints.size(); i += 2) {
            possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
        }
@@ -2916,12 +3042,6 @@ private:
    }
 };

-// TODO @ngxson : decprecate the load_image_size singleton pattern
-int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
-    const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
-    return inst.grid_size.width;
-}
-
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -2943,9 +3063,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
            res_imgs->entries.push_back(std::move(res));
        }
+
+        res_imgs->grid_x = inst.grid_size.width;
+        res_imgs->grid_y = inst.grid_size.height;
        return true;
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
        clip_image_u8 resized;
        auto patch_size = params.patch_size * 2;
        auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -2971,8 +3094,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
        res_imgs->entries.push_back(std::move(img_f32));
        return true;
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
        clip_image_u8 resized_image;
        auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
        image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
@@ -2980,6 +3103,22 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
        res_imgs->entries.push_back(std::move(img_f32));
        return true;
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
+        GGML_ASSERT(!params.image_grid_pinpoints.empty());
+        auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+        std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
+            res_imgs->entries.push_back(std::move(res));
+        }
+
+        res_imgs->grid_x = inst.grid_size.width;
+        res_imgs->grid_y = inst.grid_size.height;
+        return true;
+
    }

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3098,6 +3237,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    const auto & params = ctx->vision_model.hparams;

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    int scale_factor = ctx->vision_model.hparams.proj_scale_factor;

    if (ctx->proj_type == PROJECTOR_TYPE_LDP
            || ctx->proj_type == PROJECTOR_TYPE_LDPV2
@@ -3136,6 +3276,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
        int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
        n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
+        n_patches /= (scale_factor * scale_factor);
    }

    return n_patches;
@@ -3247,6 +3389,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // build the inference graph
+    ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3261,8 +3404,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
-    const int pos_w = ctx->load_image_size.width  / patch_size;
-    const int pos_h = ctx->load_image_size.height / patch_size;
+    const int pos_w = image_size_width  / patch_size;
+    const int pos_h = image_size_height / patch_size;

    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

@@ -3528,6 +3671,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            {
                // do nothing
            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
+                // last pos is always kept 0, it's for CLS
+                // dimension H
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i / n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i % n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -3548,6 +3708,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return false;
    }

+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

@@ -3596,6 +3768,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->vision_model.projection->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->vision_model.mm_3_w->ne[1];
+        case PROJECTOR_TYPE_LLAMA4:
+            return ctx->vision_model.mm_model_proj->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -47,10 +47,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
 // this should be equal to the embedding dimension of the text model
 int clip_n_mmproj_embd(const struct clip_ctx * ctx);

-int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
-void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
-struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
-
 struct clip_image_size      * clip_image_size_init(void);
 struct clip_image_u8        * clip_image_u8_init (void);
 struct clip_image_f32       * clip_image_f32_init(void);
@@ -231,12 +231,14 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
        while (i < n_tokens) { // split into batches
            text_batch.n_tokens = 0; // clear the batch
            for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
+                int32_t j = text_batch.n_tokens;
+                text_batch.token   [j]    = tokens[i];
+                text_batch.pos     [j]    = n_past++;
+                text_batch.n_seq_id[j]    = 1;
+                text_batch.seq_id  [j][0] = seq_id;
+                text_batch.logits  [j]    = false;
+
                text_batch.n_tokens++;
-                text_batch.token   [i]    = tokens[i];
-                text_batch.pos     [i]    = n_past++;
-                text_batch.n_seq_id[i]    = 1;
-                text_batch.seq_id  [i][0] = seq_id;
-                text_batch.logits  [i]    = false;
            }
            bool is_last_token = (i == n_tokens);
            if (logits_last && is_last_token) {
@@ -42,6 +42,7 @@ enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_NONE,
    MTMD_SLICE_TMPL_MINICPMV_2_5,
    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    MTMD_SLICE_TMPL_LLAMA4,
    // TODO @ngxson : add support for idefics (SmolVLM)
 };

@@ -64,15 +65,19 @@ struct mtmd_context {
    int n_threads;
    std::string image_marker;

-    // for minicpmv, we need special tokens in-between slices
+    // for llava-uhd style models, we need special tokens in-between slices
+    // minicpmv calls them "slices", llama 4 calls them "tiles"
    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
-    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+    bool        tok_row_end_trail = false;
+    bool        ov_img_first      = false;

    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE

@@ -96,6 +101,7 @@ struct mtmd_context {

        use_mrope = clip_is_qwen2vl(ctx_clip);

+        projector_type proj = clip_get_projector_type(ctx_clip);
        int minicpmv_version = clip_is_minicpmv(ctx_clip);
        if (minicpmv_version == 2) {
            // minicpmv 2.5 format:
@@ -108,6 +114,8 @@ struct mtmd_context {
            tok_sli_img_start = tok_ov_img_start;
            tok_sli_img_end   = tok_ov_img_end;
            tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;

        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
            // minicpmv 2.6 format:
@@ -118,9 +126,25 @@ struct mtmd_context {
            tok_sli_img_start = lookup_token("<slice>");
            tok_sli_img_end   = lookup_token("</slice>");
            tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;

        } else if (minicpmv_version != 0) {
            GGML_ASSERT(false && "unsupported minicpmv version");
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // llama 4 format:
+            // <|image_start|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+            // <|image|> (overview)           <-- overview image is last
+            // <|image_end|>
+            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+            tok_ov_img_start  = lookup_token("<|image|>");
+            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
+            tok_row_end       = lookup_token("<|tile_y_separator|>");
+            tok_row_end_trail = true; // add trailing end-of-row token
+            ov_img_first      = false; // overview image is last
        }
    }

@@ -243,16 +267,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
        // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
        marker_modified = ctx->image_marker + "[IMG_END]";
        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-    }

-    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
        marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

-    }
+    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
+        // (more details in mtmd_context constructor)
+        marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

-    else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
        // <img> ... (image embeddings) ... </img>
        marker_modified = "<img>" + ctx->image_marker + "</img>";
        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
@@ -328,7 +354,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
            img_u8->ny = bitmaps[i_img]->ny;
            img_u8->buf.resize(bitmaps[i_img]->data.size());
            std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
-            clip_image_size img_u8_size{img_u8->nx, img_u8->ny};

            // preprocess image
            clip_image_f32_batch batch_f32;
@@ -338,28 +363,40 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                return 2;
            }

-            if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
+            // handle llava-uhd style preprocessing
+            if (
+                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+            ) {
                // split batch into chunks of single images
                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
                GGML_ASSERT(chunks.size() > 0);

-                // add overview image
-                add_text_chunk({ctx->tok_ov_img_start});
-                output->entries.emplace_back(std::move(chunks.front()));
+                auto ov_chunk = std::move(chunks.front());
                chunks.erase(chunks.begin());
-                add_text_chunk({ctx->tok_ov_img_end});

-                // add slices
+                // add overview image (first)
+                if (ctx->ov_img_first) {
+                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_start});
+                    }
+                    output->entries.emplace_back(std::move(ov_chunk));
+                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_end});
+                    }
+                }
+
+                // add slices (or tiles)
                if (!chunks.empty()) {
-                    clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
-                    int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
-                    int n_row = (int)chunks.size() / n_col;
-                    GGML_ASSERT(n_row * n_col == (int)chunks.size());
+                    const int n_col = batch_f32.grid_x;
+                    const int n_row = batch_f32.grid_y;
                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
                        add_text_chunk({ctx->tok_slices_start});
                    }
                    for (int y = 0; y < n_row; y++) {
                        for (int x = 0; x < n_col; x++) {
+                            const bool is_last_in_row = (x == n_col - 1);
                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
                                add_text_chunk({ctx->tok_sli_img_start});
                            }
@@ -367,8 +404,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
                                add_text_chunk({ctx->tok_sli_img_end});
                            }
+                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_mid});
+                            }
                        }
-                        if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
+                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
                            add_text_chunk({ctx->tok_row_end});
                        }
                    }
@@ -377,6 +417,17 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    }
                }

+                // add overview image (last)
+                if (!ctx->ov_img_first) {
+                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_start});
+                    }
+                    output->entries.emplace_back(std::move(ov_chunk));
+                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_end});
+                    }
+                }
+
            } else {
                size_t n_tokens = 0;
                for (const auto & entry : batch_f32.entries) {
@@ -427,14 +478,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;

-    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
-    {
-        clip_image_size slice_size{
-            image_tokens->batch_f32.entries[0]->nx,
-            image_tokens->batch_f32.entries[0]->ny};
-        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
-    }
-
    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
@@ -21,6 +21,13 @@ if [ "${1:-}" = "big" ]; then
    echo "Include BIG models..."
 fi

+RUN_HUGE_TESTS=false  # enabled by passing "huge" as the first argument
+if [ "${1:-}" = "huge" ]; then
+    RUN_HUGE_TESTS=true
+    RUN_BIG_TESTS=true  # "huge" also runs the big models
+    echo "Include BIG and HUGE models..."
+fi
+
 ###############

 arr_bin=()
@@ -42,7 +49,7 @@ add_test "llama-mtmd-cli"  "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test "llama-mtmd-cli"  "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/Llava-v1.5-7B-GGUF:Q2_K"            "vicuna"
-add_test "llama-mtmd-cli"  "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"           "vicuna"
+add_test "llama-mtmd-cli"  "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M"         "vicuna"
 add_test "llama-mtmd-cli"  "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -60,10 +67,17 @@ if [ "$RUN_BIG_TESTS" = true ]; then
    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli"  "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli"  "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
+fi
+
+# to test the huge models, run: ./tests.sh huge
+# this will run both the big and huge models
+# huge models are > 32B parameters
+if [ "$RUN_HUGE_TESTS" = true ]; then
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
 fi

 # these models always give the wrong answer, not sure why
@@ -13,6 +13,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 * Multimodal ([documentation](../../docs/multimodal.md)) / with OpenAI-compatible API support
 * Monitoring endpoints
 * Schema-constrained JSON response format
+ * Prefilling of assistant messages similar to the Claude API
 * [Function calling](../../docs/function-calling.md) / tool use for ~any model
 * Speculative decoding
 * Easy-to-use web UI
@@ -175,6 +176,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--no-prefill-assistant` | disable prefilling of the assistant's response when the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, a trailing assistant message is treated as a complete message and is not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
@@ -2004,6 +2004,23 @@ struct server_context {
            }
        }

+        if (!llama_kv_self_can_shift(ctx)) {
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
+        }
+
        return true;
    }

@@ -2251,6 +2268,14 @@ struct server_context {
            slot.has_next_token = true;
        }

+        // if context shifting is disabled, make sure that we don't run out of context
+        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.stop           = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+        }
+
        // check the limits
        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
            slot.stop           = STOP_TYPE_LIMIT;
@@ -3173,7 +3198,15 @@ struct server_context {
                                // if we don't cache the prompt, we have to remove the entire KV cache
                                llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                                slot.n_past = 0;
-                                slot.cache_tokens.clear();
+                                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+                            }
+
+                            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                                    SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+                                            "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                                    slot.n_past = 0;
+                                }
                            }
                        }

@@ -4340,6 +4373,7 @@ int main(int argc, char ** argv) {
        json data = oaicompat_completion_params_parse(
            body,
            params.use_jinja,
+            params.prefill_assistant,
            params.reasoning_format,
            ctx_server.chat_templates.get(),
            ctx_server.mctx,
@@ -4361,6 +4395,7 @@ int main(int argc, char ** argv) {
        json data = oaicompat_completion_params_parse(
            body,
            params.use_jinja,
+            params.prefill_assistant,
            params.reasoning_format,
            ctx_server.chat_templates.get(),
            ctx_server.mctx,
@@ -65,3 +65,21 @@ def test_ctx_shift_disabled_long_prompt():
    assert res.status_code != 200
    assert "error" in res.body
    assert "exceeds the available context size" in res.body["error"]["message"]
+
+def test_ctx_shift_disabled_stream():
+    global server
+    server.disable_ctx_shift = True
+    server.start()
+    res = server.make_stream_request("POST", "/v1/completions", data={
+        "n_predict": 256,
+        "prompt": "Once",
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        choice = data["choices"][0]
+        if choice["finish_reason"] == "length":
+            assert len(content) > 0
+        else:
+            assert choice["finish_reason"] is None
+            content += choice["text"]
@@ -583,6 +583,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
    const json & body, /* openai api json semantics */
    bool use_jinja,
+    bool prefill_assistant,
    common_reasoning_format reasoning_format,
    const struct common_chat_templates * tmpls,
    bool allow_non_text,
@@ -732,7 +733,7 @@ static json oaicompat_completion_params_parse(

    // if the assistant message appears at the end of list, we do not add end-of-turn token
    // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
    common_chat_msg last_message;
    if (prefill_assistant_message) {
        last_message = inputs.messages.back();
@@ -28,13 +28,13 @@ function AppLayout() {
  return (
    <>
      <Sidebar />
-      <div
+      <main
        className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto bg-base-100"
        id="main-scroll"
      >
        <Header />
        <Outlet />
-      </div>
+      </main>
      {
        <SettingDialog
          show={showSettings}
@@ -18,16 +18,26 @@ export default function ChatInputExtraContextItem({
  if (!items) return null;

  return (
-    <div className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1">
+    <div
+      className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1"
+      role="group"
+      aria-description="Selected files"
+    >
      {items.map((item, i) => (
        <div
          className="indicator"
          key={i}
          onClick={() => clickToShow && setShow(i)}
+          tabIndex={0}
+          aria-description={
+            clickToShow ? `Click to show: ${item.name}` : undefined
+          }
+          role={clickToShow ? 'button' : 'menuitem'}
        >
          {removeItem && (
            <div className="indicator-item indicator-top">
              <button
+                aria-label="Remove file"
                className="btn btn-neutral btn-sm w-4 h-4 p-0 rounded-full"
                onClick={() => removeItem(i)}
              >
@@ -46,13 +56,16 @@ export default function ChatInputExtraContextItem({
              <>
                <img
                  src={item.base64Url}
-                  alt={item.name}
+                  alt={`Preview image for ${item.name}`}
                  className="w-14 h-14 object-cover rounded-md"
                />
              </>
            ) : (
              <>
-                <div className="w-14 h-14 flex items-center justify-center">
+                <div
+                  className="w-14 h-14 flex items-center justify-center"
+                  aria-description="Document icon"
+                >
                  <DocumentTextIcon className="h-8 w-14 text-base-content/50" />
                </div>

@@ -66,16 +79,25 @@ export default function ChatInputExtraContextItem({
      ))}

      {showingItem && (
-        <dialog className="modal modal-open">
+        <dialog
+          className="modal modal-open"
+          aria-description={`Preview ${showingItem.name}`}
+        >
          <div className="modal-box">
            <div className="flex justify-between items-center mb-4">
              <b>{showingItem.name ?? 'Extra content'}</b>
-              <button className="btn btn-ghost btn-sm">
+              <button
+                className="btn btn-ghost btn-sm"
+                aria-label="Close preview dialog"
+              >
                <XMarkIcon className="h-5 w-5" onClick={() => setShow(-1)} />
              </button>
            </div>
            {showingItem.type === 'imageFile' ? (
-              <img src={showingItem.base64Url} alt={showingItem.name} />
+              <img
+                src={showingItem.base64Url}
+                alt={`Preview image for ${showingItem.name}`}
+              />
            ) : (
              <div className="overflow-x-auto">
                <pre className="whitespace-pre-wrap break-words text-sm">
@@ -83,13 +83,20 @@ export default function ChatMessage({

  if (!viewingChat) return null;

+  const isUser = msg.role === 'user';
+
  return (
-    <div className="group" id={id}>
+    <div
+      className="group"
+      id={id}
+      role="group"
+      aria-description={`Message from ${msg.role}`}
+    >
      <div
        className={classNames({
          chat: true,
-          'chat-start': msg.role !== 'user',
-          'chat-end': msg.role === 'user',
+          'chat-start': !isUser,
+          'chat-end': isUser,
        })}
      >
        {msg.extra && msg.extra.length > 0 && (
@@ -99,7 +106,7 @@ export default function ChatMessage({
        <div
          className={classNames({
            'chat-bubble markdown': true,
-            'chat-bubble bg-transparent': msg.role !== 'user',
+            'chat-bubble bg-transparent': !isUser,
          })}
        >
          {/* textarea for editing message */}
@@ -142,7 +149,7 @@ export default function ChatMessage({
              ) : (
                <>
                  {/* render message as markdown */}
-                  <div dir="auto">
+                  <div dir="auto" tabIndex={0}>
                    {thought && (
                      <ThoughtProcess
                        isThinking={!!isThinking && !!isPending}
@@ -196,13 +203,18 @@ export default function ChatMessage({
          })}
        >
          {siblingLeafNodeIds && siblingLeafNodeIds.length > 1 && (
-            <div className="flex gap-1 items-center opacity-60 text-sm">
+            <div
+              className="flex gap-1 items-center opacity-60 text-sm"
+              role="navigation"
+              aria-description={`Message version ${siblingCurrIdx + 1} of ${siblingLeafNodeIds.length}`}
+            >
              <button
                className={classNames({
                  'btn btn-sm btn-ghost p-1': true,
                  'opacity-20': !prevSibling,
                })}
                onClick={() => prevSibling && onChangeSibling(prevSibling)}
+                aria-label="Previous message version"
              >
                <ChevronLeftIcon className="h-4 w-4" />
              </button>
@@ -215,6 +227,7 @@ export default function ChatMessage({
                  'opacity-20': !nextSibling,
                })}
                onClick={() => nextSibling && onChangeSibling(nextSibling)}
+                aria-label="Next message version"
              >
                <ChevronRightIcon className="h-4 w-4" />
              </button>
@@ -223,7 +236,7 @@ export default function ChatMessage({
          {/* user message */}
          {msg.role === 'user' && (
            <BtnWithTooltips
-              className="btn-mini show-on-hover w-8 h-8"
+              className="btn-mini w-8 h-8"
              onClick={() => setEditingContent(msg.content)}
              disabled={msg.content === null}
              tooltipsContent="Edit message"
@@ -236,7 +249,7 @@ export default function ChatMessage({
            <>
              {!isPending && (
                <BtnWithTooltips
-                  className="btn-mini show-on-hover w-8 h-8"
+                  className="btn-mini w-8 h-8"
                  onClick={() => {
                    if (msg.content !== null) {
                      onRegenerateMessage(msg as Message);
@@ -250,10 +263,7 @@ export default function ChatMessage({
              )}
            </>
          )}
-          <CopyButton
-            className="btn-mini show-on-hover w-8 h-8"
-            content={msg.content}
-          />
+          <CopyButton className="btn-mini w-8 h-8" content={msg.content} />
        </div>
      )}
    </div>
@@ -271,6 +281,8 @@ function ThoughtProcess({
 }) {
  return (
    <div
+      role="button"
+      aria-label="Toggle thought process display"
      tabIndex={0}
      className={classNames({
        'collapse bg-none': true,
@@ -292,7 +304,11 @@ function ThoughtProcess({
          )}
        </div>
      </div>
-      <div className="collapse-content text-base-content/70 text-sm p-1">
+      <div
+        className="collapse-content text-base-content/70 text-sm p-1"
+        tabIndex={0}
+        aria-description="Thought process content"
+      >
        <div className="border-l-2 border-base-content/20 pl-4 mb-4">
          <MarkdownDisplay content={content} />
        </div>
@@ -279,7 +279,11 @@ export default function ChatScreen() {
 function ServerInfo() {
  const { serverProps } = useAppContext();
  return (
-    <div className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6">
+    <div
+      className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
+      tabIndex={0}
+      aria-description="Server information"
+    >
      <div className="card-body">
        <b>Server Info</b>
        <p>
@@ -311,6 +315,8 @@ function ChatInput({

  return (
    <div
+      role="group"
+      aria-label="Chat input"
      className={classNames({
        'flex items-end pt-8 pb-6 sticky bottom-0 bg-base-100': true,
        'opacity-50': isDrag, // simply visual feedback to inform user that the file will be accepted
@@ -400,13 +406,15 @@ function ChatInput({
                    'btn w-8 h-8 p-0 rounded-full': true,
                    'btn-disabled': isGenerating,
                  })}
+                  aria-label="Upload file"
+                  tabIndex={0}
+                  role="button"
                >
                  <PaperClipIcon className="h-5 w-5" />
                </label>
                <input
                  id="file-upload"
                  type="file"
-                  className="hidden"
                  disabled={isGenerating}
                  {...getInputProps()}
                  hidden
@@ -422,6 +430,7 @@ function ChatInput({
                  <button
                    className="btn btn-primary w-8 h-8 p-0 rounded-full"
                    onClick={onSend}
+                    aria-label="Send message"
                  >
                    <ArrowUpIcon className="h-5 w-5" />
                  </button>
@@ -38,8 +38,12 @@ export default function Header() {

      {/* action buttons (top right) */}
      <div className="flex items-center">
-        <div className="tooltip tooltip-bottom" data-tip="Settings">
-          <button className="btn" onClick={() => setShowSettings(true)}>
+        <div
+          className="tooltip tooltip-bottom"
+          data-tip="Settings"
+          onClick={() => setShowSettings(true)}
+        >
+          <button className="btn" aria-hidden={true}>
            {/* settings button */}
            <Cog8ToothIcon className="w-5 h-5" />
          </button>
@@ -335,14 +335,22 @@ export default function SettingDialog({
  };

  return (
-    <dialog className={classNames({ modal: true, 'modal-open': show })}>
+    <dialog
+      className={classNames({ modal: true, 'modal-open': show })}
+      aria-label="Settings dialog"
+    >
      <div className="modal-box w-11/12 max-w-3xl">
        <h3 className="text-lg font-bold mb-6">Settings</h3>
        <div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
          {/* Left panel, showing sections - Desktop version */}
-          <div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
+          <div
+            className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200"
+            role="complementary"
+            aria-description="Settings sections"
+            tabIndex={0}
+          >
            {SETTING_SECTIONS.map((section, idx) => (
-              <div
+              <button
                key={idx}
                className={classNames({
                  'btn btn-ghost justify-start font-normal w-44 mb-1': true,
@@ -352,12 +360,16 @@ export default function SettingDialog({
                dir="auto"
              >
                {section.title}
-              </div>
+              </button>
            ))}
          </div>

          {/* Left panel, showing sections - Mobile version */}
-          <div className="md:hidden flex flex-row gap-2 mb-4">
+          {/* This menu is skipped for a11y; otherwise it would duplicate the desktop version */}
+          <div
+            className="md:hidden flex flex-row gap-2 mb-4"
+            aria-disabled={true}
+          >
            <details className="dropdown">
              <summary className="btn bt-sm w-full m-1">
                {SETTING_SECTIONS[sectionIdx].title}
@@ -50,44 +50,72 @@ export default function Sidebar() {
        id="toggle-drawer"
        type="checkbox"
        className="drawer-toggle"
+        aria-label="Toggle sidebar"
        defaultChecked
      />

-      <div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
+      <div
+        className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64"
+        role="complementary"
+        aria-label="Sidebar"
+        tabIndex={0}
+      >
        <label
          htmlFor="toggle-drawer"
-          aria-label="close sidebar"
+          aria-label="Close sidebar"
          className="drawer-overlay"
        ></label>
+
+        <a
+          href="#main-scroll"
+          className="absolute -left-80 top-0 w-1 h-1 overflow-hidden"
+        >
+          Skip to main content
+        </a>
+
        <div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
          <div className="flex flex-row items-center justify-between mb-4 mt-4">
-            <h2 className="font-bold ml-4">Conversations</h2>
+            <h2 className="font-bold ml-4" role="heading">
+              Conversations
+            </h2>

            {/* close sidebar button */}
-            <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
+            <label
+              htmlFor="toggle-drawer"
+              className="btn btn-ghost lg:hidden"
+              aria-label="Close sidebar"
+              role="button"
+              tabIndex={0}
+            >
              <XMarkIcon className="w-5 h-5" />
            </label>
          </div>

          {/* new conversation button */}
-          <div
+          <button
            className={classNames({
              'btn btn-ghost justify-start px-2': true,
              'btn-soft': !currConv,
            })}
            onClick={() => navigate('/')}
+            aria-label="New conversation"
          >
            <PencilSquareIcon className="w-5 h-5" />
            New conversation
-          </div>
+          </button>

          {/* list of conversations */}
          {groupedConv.map((group, i) => (
-            <div key={i}>
+            <div key={i} role="group">
              {/* group name (by date) */}
              {group.title ? (
                // we use btn class here to make sure that the padding/margin are aligned with the other items
-                <b className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold">
+                <b
+                  className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold"
+                  role="note"
+                  aria-description={group.title}
+                  tabIndex={0}
+                >
                  {group.title}
                </b>
              ) : (
@@ -184,20 +212,23 @@ function ConversationItem({
 }) {
  return (
    <div
+      role="menuitem"
+      tabIndex={0}
+      aria-label={conv.name}
      className={classNames({
        'group flex flex-row btn btn-ghost justify-start items-center font-normal px-2 h-9':
          true,
        'btn-soft': isCurrConv,
      })}
    >
-      <div
+      <button
        key={conv.id}
        className="w-full overflow-hidden truncate text-start"
        onClick={onSelect}
        dir="auto"
      >
        {conv.name}
-      </div>
+      </button>
      <div className="dropdown dropdown-end h-5">
        <BtnWithTooltips
          // on mobile, we always show the ellipsis icon
@@ -211,22 +242,23 @@ function ConversationItem({
        </BtnWithTooltips>
        {/* dropdown menu */}
        <ul
+          aria-label="More options"
          tabIndex={0}
          className="dropdown-content menu bg-base-100 rounded-box z-[1] p-2 shadow"
        >
-          <li onClick={onRename}>
+          <li onClick={onRename} tabIndex={0}>
            <a>
              <PencilIcon className="w-4 h-4" />
              Rename
            </a>
          </li>
-          <li onClick={onDownload}>
+          <li onClick={onDownload} tabIndex={0}>
            <a>
              <ArrowDownTrayIcon className="w-4 h-4" />
              Download
            </a>
          </li>
-          <li className="text-error" onClick={onDelete}>
+          <li className="text-error" onClick={onDelete} tabIndex={0}>
            <a>
              <TrashIcon className="w-4 h-4" />
              Delete
@@ -34,9 +34,6 @@ html {
  /* TODO: fix markdown table */
 }

-.show-on-hover {
-  @apply md:opacity-0 md:group-hover:opacity-100;
-}
 .btn-mini {
  @apply cursor-pointer;
 }
@@ -52,13 +52,20 @@ export function BtnWithTooltips({
  tooltipsContent: string;
  disabled?: boolean;
 }) {
+  // the onClick handler is on the container, so screen readers can safely ignore the inner button
+  // this prevents the label from being read twice
  return (
-    <div className="tooltip tooltip-bottom" data-tip={tooltipsContent}>
+    <div
+      className="tooltip tooltip-bottom"
+      data-tip={tooltipsContent}
+      role="button"
+      onClick={onClick}
+    >
      <button
        className={`${className ?? ''} flex items-center justify-center`}
-        onClick={onClick}
        disabled={disabled}
        onMouseLeave={onMouseLeave}
+        aria-hidden={true}
      >
        {children}
      </button>
Author	SHA1	Message	Date
l3utterfly	b7a17463ec	mtmd-helper : bug fix to token batching in mtmd (#13650 ) * Update mtmd-helper.cpp * Update tools/mtmd/mtmd-helper.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>	2025-05-20 18:55:30 +02:00
Georgi Gerganov	be0239693c	model : fix llama4 graph (#13663 ) ggml-ci	2025-05-20 19:21:04 +03:00
Georgi Gerganov	a4090d1174	llama : remove llama_kv_cache_view API + remove deprecated (#13653 ) ggml-ci	2025-05-20 16:13:16 +03:00
Johannes Gäßler	b69f1647f9	CUDA: skip fully masked-out KV in FA vec kernel (#13584 ) * CUDA: skip fully masked-out KV in FA vec kernel	2025-05-20 14:45:07 +02:00
Sigbjørn Skjæret	759e37b0d8	tests : avoid github urls due to throttling (#13654 )	2025-05-20 12:03:17 +02:00
Svetlozar Georgiev	4245e622e0	sycl: disable reorder for sycl mulmat (#13536 )	2025-05-20 11:34:15 +02:00
0cc4m	c9c64dee57	Set GLM4 blk..attn_output.weight, kqv_out- matmul to GGML_PREC_F32 to fix infinity values in output (#13639 )	2025-05-20 10:11:56 +02:00
Georgi Gerganov	c00a2634be	metal : fix typo in FA kernel comments (#13651 )	2025-05-20 10:41:40 +03:00
Georgi Gerganov	e298d2fbd0	kv-cache : add SWA support (#13194 ) * kv-cache : prepare for SWA ggml-ci * kv-cache : initial iSWA implementation ggml-ci * kv-cache : rework error recovery logic ggml-ci * models : fix Phi-3 SWA parameters ggml-ci * model : adjust Granite to rope factor changes ggml-ci * server : check if context can do shifts ggml-ci * iswa : for now, always enable shifts (experiment) ggml-ci * kv-cache : simplify SWA logic ggml-ci * kv-cache : apply defrag when we fail to find slots for the batch ggml-ci * llama : update docs about llama_decode ggml-ci * kv-cache : update warning logs when no space for the batch is available ggml-ci * llama : add llama_kv_self_seq_pos_min() * kv-cache : keep track of partial SWA computes and print warnings * server : disallow use cases involving partial SWA context ggml-ci * llama : add param to control SWA cache size ggml-ci * minor : clean-up ggml-ci	2025-05-20 08:05:46 +03:00
Xinpeng Dou	f0adb80bf7	CANN: Update CANN model support (#13162 ) * Update CANN model support status * Update of model support * update * update * update * fix format of CANN.md * fix format of CANN.md * fix format of CANN.md	2025-05-20 11:43:43 +08:00
Nicolò Scipione	f7c9429c85	sycl : Overcoming workaround for mmap() allocation on Windows (#13482 ) * Remove mmap workaround on windows After some testing I found that mmap is supported on windows and for many GPUs on Linux. Therefore I remove the workaround for windows since it is not necessary. * Update llama-bench README SYCL backend introduced a workaround that allows execution of llama-bench also without specifying `--mmp 0` flag	2025-05-20 08:54:43 +08:00
psocolovsky	1dfbf2cf3a	common : add load_progress_callback (#13617 )	2025-05-19 21:17:36 +02:00
0cc4m	8960efd0a6	Vulkan: Add f32 accumulator support to quantized mul mat to fix GLM4 32B incoherence (#13607 )	2025-05-19 17:54:08 +02:00
Alberto Cabrera Pérez	725f23f1f3	sycl : backend documentation review (#13544 ) * sycl: reviewing and updating docs * Updates Runtime error codes * Improves OOM troubleshooting entry * Added a llama 3 sample * Updated supported models * Updated releases table	2025-05-19 14:38:20 +01:00
Xuan-Son Nguyen	92ecdcc06a	mtmd : add vision support for llama 4 (#13282 ) * wip llama 4 conversion * rm redundant __init__ * fix conversion * fix conversion * test impl * try this * reshape patch_embeddings_0 * fix view * rm ffn_post_norm * cgraph ok * f32 for pos embd * add image marker tokens * Llama4UnfoldConvolution * correct pixel shuffle * fix merge conflicts * correct * add debug_graph * logits matched, but it still preceives the image incorrectly * fix style * add image_grid_pinpoints * handle llama 4 preprocessing * rm load_image_size * rm unused line * fix * small fix 2 * add test & docs * fix llava-1.6 test * test: add notion of huge models * add comment * add warn about degraded quality	2025-05-19 13:04:14 +02:00
Alberto Cabrera Pérez	f71f40a284	ci : upgraded oneAPI version in SYCL workflows and dockerfile (#13532 )	2025-05-19 11:46:09 +01:00
Georgi Gerganov	d30cb5a7fa	sync : ggml ggml-ci	2025-05-19 13:29:56 +03:00
Johannes Gäßler	6c35981a64	mnist: fix segmentation fault (ggml/1227)	2025-05-19 13:29:56 +03:00
Diego Devesa	8b5e19aea6	ggml : fix apple OS check in ggml_print_backtrace (ggml/1229)	2025-05-19 13:29:56 +03:00
Daniel Tang	60aea028b5	ggml : Fix missing backtrace on Linux (ggml/1228) * Modern Linux defaults /proc/sys/kernel/yama/ptrace_scope to 1 * Fixed lldb attach * Simplify by having the child do ggml_print_backtrace_symbols	2025-05-19 13:29:56 +03:00
Nick	9c55e5c5c2	fix: check model pointer validity before use (#13631 )	2025-05-19 13:25:41 +03:00
Chenguang Li	33d7aed4a8	CANN: Support MOE Model MUL_MAT_ID (#13042 ) Signed-off-by: noemotiovon <757486878@qq.com>	2025-05-19 14:21:17 +08:00
Isaac McFadyen	6a2bc8bfb7	server : added --no-prefill-assistant flag (#13608 ) * added no-prefill-assistant flag * reworded documentation comment * updated server README.md	2025-05-17 23:59:48 +02:00
Gilad S.	e3a7cf6c5b	cmake: use the current build config for vulkan-shaders-gen (#13595 ) * fix: use the current build config for `vulkan-shaders-gen` * fix: only pass a valid build type to `--config`	2025-05-17 15:26:43 -03:00
Georgi Gerganov	518329b2d4	parallel : add option for non-shared and larger prompts (#13598 ) * parallel : add option for non-shared and larger prompts * parallel : update readme [no ci] * cont : add note about base models [no ci] * parallel : better var name ggml-ci	2025-05-17 12:58:55 +03:00
Jeff Bolz	2f5a4e1e09	vulkan: move common FA code to flash_attn_base.comp (#13556 ) * vulkan: move common FA code to flash_attn_base.comp * vulkan: move common FA index/stride setup code to flash_attn_base.comp * build fix	2025-05-17 09:14:55 +02:00
Jeff Bolz	4f41ee11d6	vulkan: use scalar FA rather than coopmat2 when N==1 (#13554 )	2025-05-17 08:35:47 +02:00
Z	3e0be1cace	llguidance : official v0.7.20 release (no actual changes) [noci] (#13594 )	2025-05-16 22:56:28 +02:00
Xuan-Son Nguyen	6aa892ec2a	server : do not return error out of context (with ctx shift disabled) (#13577 )	2025-05-16 21:50:00 +02:00
Xuan-Son Nguyen	aea9f8b4e7	webui : improve accessibility for visually impaired people (#13551 ) * webui : improve accessibility for visually impaired people * add a11y for extra contents * fix some labels being read twice * add skip to main content	2025-05-16 21:49:01 +02:00
Xuan-Son Nguyen	06c1e4abc1	readme : add list of dependencies and their license (#13591 )	2025-05-16 20:04:18 +02:00
Diego Devesa	415e40a357	releases : use arm version of curl for arm releases (#13592 )	2025-05-16 19:36:51 +02:00