vulkan: prefer host-visible memory buffers on UMA devices (#22930 )

* implement UMA host-visible memory * update based on 0cc4m's suggestion
vulkan: Support gated_delta_net with S_v=16 (#24581 )
2026-06-16 10:46:43 +02:00 · 2026-06-16 09:36:52 +02:00 · 2026-06-16 09:26:57 +02:00 · 2026-06-16 10:23:09 +03:00 · 2026-06-16 08:26:05 +02:00 · 2026-06-16 08:35:00 +03:00
35 changed files with 1043 additions and 288 deletions
@@ -7,7 +7,7 @@ ARG APP_REVISION=N/A

 FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

-ARG GGML_SYCL_F16=OFF
+ARG GGML_SYCL_F16=ON
 ARG LEVEL_ZERO_VERSION=1.28.2
 ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
@@ -24,7 +24,8 @@ COPY . .

 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
+        && export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
@@ -37,7 +37,7 @@ LLM inference in C/C++

 Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:

- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
 - Run with Docker - see our [Docker documentation](docs/docker.md)
 - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
 - Build from source by cloning this repository - check out [our build guide](docs/build.md)
@@ -103,6 +103,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
            data.grammar_triggers = {
                { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
            };
+            if (autoparser.tools.format.openai_wrapper_trigger) {
+                // model emits the OpenAI function wrapper, trigger on it
+                data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
+            }
        }
    }

@@ -224,13 +228,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        auto single_tool_parser = p.standard_json_tools(
            format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
        tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
    } else {
        tools_parser = p.standard_json_tools(
            format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
    }

    // Handle content wrappers if present
@@ -181,6 +181,7 @@ struct tool_format_analysis {

    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
+    bool openai_wrapper_trigger = false;  // model emits the OpenAI function wrapper, trigger on it

    std::string              function_field = "function";
    std::string              name_field     = "name";
@@ -165,6 +165,14 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
          }
      },
+      // template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
+              tmpl.src.find("Do not use variables.") != std::string::npos) {
+              analysis.tools.format.openai_wrapper_trigger = true;
+              LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
+          }
+      },

    });

@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
                auto arg_name_parser = literal(prop_name);

                common_peg_parser arg_value_parser = eps();
-                auto string_value_parser = choice({
-                    literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
-                    literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
-                });
+                // Quoted literal as a value: normalize_quotes_to_json preserves escapes.
+                auto string_value_parser = tool_arg_value(choice({
+                    literal("\"") + string_content('"') + literal("\""),
+                    literal("'") + string_content('\'') + literal("'")
+                }));

                if (is_string_type) {
                    arg_value_parser = string_value_parser;
@@ -745,7 +746,8 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
    const std::string &              effective_args_key,
    const std::string &              call_id_key,
    const std::string &              gen_call_id_key,
-    const std::vector<std::string> & parameters_order) {
+    const std::vector<std::string> & parameters_order,
+    bool                             accept_openai_wrapper) {

    auto tool_choices    = choice();
    auto name_key_parser = literal("\"" + effective_name_key + "\"");
@@ -807,7 +809,13 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
                return idx_a < idx_b;
            });

-        auto ordered_body = tool_open(literal("{")) + space();
+        // accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
+        common_peg_parser type_field = eps();
+        if (accept_openai_wrapper) {
+            type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
+                                  literal("\"function\"") + space() + literal(",") + space());
+        }
+        auto ordered_body = tool_open(literal("{")) + space() + type_field;
        for (size_t i = 0; i < parser_pairs.size(); i++) {
            ordered_body = ordered_body + parser_pairs[i].first;
            if (i < parser_pairs.size() - 1) {
@@ -870,7 +878,8 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       bool                             function_is_key,
                                                       const std::string &              call_id_key,
                                                       const std::string &              gen_call_id_key,
-                                                       const std::vector<std::string> & parameters_order) {
+                                                       const std::vector<std::string> & parameters_order,
+                                                       bool                             accept_openai_wrapper) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
@@ -888,7 +897,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
        if (!name_spec.first.empty() || !args_spec.first.empty()) {
            tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
        } else {
-            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
+            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
        }
    }

@@ -120,7 +120,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
                                          bool                             function_is_key = false,
                                          const std::string &              call_id_key = "",
                                          const std::string &              gen_call_id_key = "",
-                                          const std::vector<std::string> & parameters_order = {});
+                                          const std::vector<std::string> & parameters_order = {},
+                                          bool                             accept_openai_wrapper = false);

    // Legacy-compatible helper for building XML/tagged style tool calls
    // Used by tests and manual parsers
@@ -157,7 +158,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
                                                 const std::string &              effective_args_key,
                                                 const std::string &              call_id_key,
                                                 const std::string &              gen_call_id_key,
-                                                 const std::vector<std::string> & parameters_order);
+                                                 const std::vector<std::string> & parameters_order,
+                                                 bool                             accept_openai_wrapper);
 };

 inline common_peg_arena build_chat_peg_parser(
@@ -2678,8 +2678,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
            }
            return msg;
        }
-        throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
-                                 effective_input.substr(result.end));
+        LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
+        LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
+        throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
    }

    common_chat_msg msg;
@@ -1507,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
    auto pieces = matcher.collect_prefix_and_next();

    std::string pattern;
+    std::string trailing;  // optional proper-prefix of a delimiter, allowed only at the very end
    for (size_t i = 0; i < pieces.size(); ++i) {
        if (i > 0) {
            pattern += " | ";
@@ -1522,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
        }

        if (!pre.empty()) {
-            pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
+            std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
+            pattern += pre_literal + " [^" + cls + "]";
+            // Each interior alternative consumes a delimiter-prefix plus a disambiguating
+            // char, so the repetition alone cannot match a value that *ends* on a proper
+            // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
+            // "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
+            // values, so without this the grammar would reject input the parser accepts.
+            // Allow the value to terminate on any proper prefix as an optional tail.
+            // This makes the grammar a slight superset of the runtime language (a value
+            // may end on the longest prefix, which greedy first-match would not itself
+            // produce); harmless for constrained generation, which only needs to admit
+            // every runtime-valid string.
+            if (!trailing.empty()) {
+                trailing += " | ";
+            }
+            trailing += pre_literal;
        } else {
            pattern += "[^" + cls + "]";
        }
    }

-    return "(" + pattern + ")*";
+    std::string result = "(" + pattern + ")*";
+    if (!trailing.empty()) {
+        result += " (" + trailing + ")?";
+    }
+    return result;
 }

 static std::unordered_set<std::string> collect_reachable_rules(
@@ -140,6 +140,8 @@ struct common_speculative_impl {
    size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
    size_t n_acc_tokens = 0; // number of tokens accepted by the target model.

+    std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
+
    // TODO: track performance of most recent calls
    const bool gen_perf = true; // whether to generate performance stats.

@@ -2059,6 +2061,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u

    {
        common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
+
+        if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
+            impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
+        }
+
+        for (size_t i = 0; i < n_accepted; ++i) {
+            impl->n_acc_tokens_per_pos[i]++;
+        }
+
        if (n_accepted > 0) {
            impl->n_acc_drafts++;
            impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2104,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
            str_perf = "";
        }

-        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
+        std::string str_stats;
+        if (impl->n_call_accept > 0) {
+            const double mean =
+                1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
+            std::ostringstream tmp;
+            tmp << std::fixed << std::setprecision(3);
+            for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
+                if (i > 0) {
+                    tmp << ", ";
+                }
+                tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
+            }
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(2) << mean;
+            str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
+        }
+
+        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
                common_speculative_type_to_str(impl->type).c_str(),
                impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
                impl->n_gen_drafts,
                impl->n_acc_drafts,
                impl->n_gen_tokens,
                impl->n_acc_tokens,
+                str_stats.c_str(),
                str_perf.c_str());
    }
 }
@@ -253,6 +253,7 @@ When targeting an intel GPU, the user should expect one or more devices among th
 #### Intel GPU

 ```sh
+# Uses FP32, consider using FP16 for better performance in most cases
 ./examples/sycl/build.sh
 ```

@@ -262,12 +263,12 @@ or
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
+# Option 1: Use FP16 (recommended for better performance in most cases)
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

+# Option 2: Use FP32
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
 # build all binary
 cmake --build build --config Release -j -v
 ```
@@ -469,6 +470,7 @@ Choose one of following methods to build from source code.
 ##### Option 1: Script

 ```sh
+# Uses FP32, consider using FP16 for better performance in most cases
 .\examples\sycl\win-build-sycl.bat
 ```

@@ -479,11 +481,11 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
 ```
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+# Option 1: Use FP16 (recommended for better performance in most cases)
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

-# Option 2: Or FP16
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+# Option 2: Use FP32
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

 cmake --build build --config Release -j
 ```
@@ -491,10 +493,10 @@ cmake --build build --config Release -j
 Or, use CMake presets to build:

 ```sh
-cmake --preset x64-windows-sycl-release
+cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-completion

-cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-completion

 cmake --preset x64-windows-sycl-debug
@@ -1,12 +1,40 @@
 # Install pre-built version of llama.cpp

-| Install via | Windows | Mac | Linux |
-|-------------|---------|-----|-------|
+| Install via | Windows | Mac  | Linux |
+|-------------|---------|------|-------|
+| conda-forge | ✅      | ✅   | ✅   |
 | Winget      | ✅      |      |      |
 | Homebrew    |         | ✅   | ✅   |
 | MacPorts    |         | ✅   |      |
 | Nix         |         | ✅   | ✅   |

+## conda-forge (Windows, Mac and Linux)
+
+conda-forge provides builds for:
+ - CUDA (Windows and Linux)
+ - Vulkan (Windows and Linux)
+ - Apple Metal (macOS)
+
+```sh
+conda install -c conda-forge llama-cpp
+```
+
+```sh
+mamba install -c conda-forge llama-cpp
+```
+
+```sh
+# Project-local installation
+pixi add llama-cpp
+
+# Global installation
+pixi global install llama-cpp
+```
+
+This distribution is managed on [`conda-forge/llama.cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/).
+
+Should you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues).
+
 ## Winget (Windows)

 ```sh
@@ -44,10 +44,10 @@ Legend:
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -89,7 +89,7 @@ Legend:
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -118,6 +118,6 @@ Legend:
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
@@ -27,20 +27,20 @@
 "SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
-"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
+"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
-"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
+"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
-"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
+"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
@@ -69,20 +69,20 @@
 "SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
-"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
-"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
-"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -111,8 +111,8 @@
 "SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
-"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
+"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -153,20 +153,20 @@
 "SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
-"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
-"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
-"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","SYCL"
 "SYCL0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","SYCL"
 "SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","SYCL"
@@ -5105,6 +5105,7 @@
 "SYCL0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
+"SYCL0","REPEAT","type=bf16,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","SYCL"
@@ -5112,6 +5113,7 @@
 "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
 "SYCL0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
+"SYCL0","REPEAT","type=bf16,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
 "SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","1","yes","SYCL"
 "SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","1","yes","SYCL"
 "SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","1","yes","SYCL"
@@ -9748,10 +9750,10 @@
 "SYCL0","COS","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
 "SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","SYCL"
 "SYCL0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","SYCL"
-"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
+"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
-"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
+"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
 "SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
 "SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
 "SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
@@ -9766,14 +9768,14 @@
 "SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","0","no","SYCL"
 "SYCL0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","SYCL"
 "SYCL0","LEAKY_RELU","type=f16,ne_a=[1024,1024,1,1],negative_slope=0.100000","support","1","yes","SYCL"
-"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
-"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
+"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
+"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
 "SYCL0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
-"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
-"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
-"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
+"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
+"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
 "SYCL0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
 "SYCL0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","SYCL"
 "SYCL0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
@@ -287,6 +287,13 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
             ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
             main_stream);
+#ifdef GGML_SYCL_HAS_BF16
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) {
+        op()((const sycl::ext::oneapi::bfloat16 *) src0->data, (const sycl::ext::oneapi::bfloat16 *) src1->data,
+             (sycl::ext::oneapi::bfloat16 *) dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2,
+             ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0),
+             ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
+#endif
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
                ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -10,6 +10,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //

+#include "ggml.h"
+
 #include "concat.hpp"

 static inline size_t elem_size(ggml_type t) {
@@ -192,11 +194,29 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    case GGML_TYPE_F32:
        concat_impl_sycl<float>(ctx, dst);
        break;
+    case GGML_TYPE_F16:
+        concat_impl_sycl<sycl::half>(ctx, dst);
+        break;
+#ifdef GGML_SYCL_HAS_BF16
+    case GGML_TYPE_BF16:
+        concat_impl_sycl<sycl::ext::oneapi::bfloat16>(ctx, dst);
+        break;
+#endif
    case GGML_TYPE_I32:
        concat_impl_sycl<int32_t>(ctx, dst);
        break;
+    case GGML_TYPE_I16:
+        concat_impl_sycl<int16_t>(ctx, dst);
+        break;
+    case GGML_TYPE_I64:
+        concat_impl_sycl<int64_t>(ctx, dst);
+        break;
+    case GGML_TYPE_I8:
+        concat_impl_sycl<int8_t>(ctx, dst);
+        break;
    default:
-    GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
+        fprintf(stderr, "%s: unsupported types: dst: %s\n", __func__, ggml_type_name(dst->type));
+        GGML_ASSERT(false);
    break;
    }
 }
@@ -1022,6 +1022,120 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
    }
 }

+static void dequantize_mul_mat_vec_q5_k_reorder(const void *__restrict__ vx, // base of the reordered Q5_K weight data
+                                                const float *__restrict__ yy, // float input vector, read at i*QK_K + offset
+                                                float *__restrict__ dst, // output, one float written per row
+                                                const int ncols, int nrows,
+                                                const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2); // work-group index along dim 2 selects the output row
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row; // index of this row's first block
+
+    // Computes dst[row] as the dot product of one Q5_K weight row with yy, reading the reordered SOA layout:
+    //   [qs: nb * QK_K/2] [qh: nb * QK_K/8] [scales: nb * K_SCALE_SIZE] [dm: nb * sizeof(half2)]
+    const int nb = nrows * num_blocks_per_row; // block count used to locate the start of each SOA section
+    const uint8_t     * qs_base     = (const uint8_t *)vx;
+    const uint8_t     * qh_base     = qs_base + (size_t)nb * (QK_K / 2);
+    const uint8_t     * scales_base = qh_base + (size_t)nb * (QK_K / 8);
+    const sycl::half2 * dm_base     = (const sycl::half2 *)(scales_base + (size_t)nb * K_SCALE_SIZE);
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
+    const int ix = item_ct1.get_local_id(2) % 2; // 0 or 1: which of the two interleaved block streams this lane walks
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2; // elements handled per inner-loop pass (loop bound below)
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0; // byte offset into the block's qs section
+    const int y_offset = 64*im + l0; // element offset into the block's slice of yy
+
+    const uint8_t hm1  = 1 << (2*im); // qh bit mask used for the y1 half
+    const uint8_t hm2  = hm1 << 4; // qh bit mask used for the y2 half
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux; // byte view of aux: sc[0..7]
+
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16; // byte view of q16: q4[0..15]
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) { // lanes with ix==0 / ix==1 take alternating blocks
+        const int bi = ib0 + i; // global block index into every SOA section
+
+        const uint8_t * ql1 = qs_base + bi * (QK_K / 2) + q_offset;
+        const uint8_t * qh  = qh_base + bi * (QK_K / 8) + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const sycl::half2 dm_val = dm_base[bi];
+        const float dall = dm_val[0]; // multiplies the scaled sums below
+        const float dmin = dm_val[1]; // multiplies smin, which is subtracted below
+
+        const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE);
+        aux[0] = a[im+0] & kmask1; // sc[0], sc[1]: weights for sum.x / sum.y
+        aux[1] = a[im+2] & kmask1; // sc[2], sc[3]: weights for the y1 part of smin
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); // sc[4], sc[5]: weights for sum.z / sum.w
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); // sc[6], sc[7]: weights for the y2 part of smin
+
+        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32; // 64 bytes further into the same block's qs
+        q16[0] = q1[0] & 0x0f0f; // low nibbles
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f; // high nibbles
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
+        for (int l = 0; l < n; ++l) {
+            sum.x() += // each term: y * (4-bit value + 16 when the masked qh bit is set)
+                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
+                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
+            sum.y() +=
+                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
+                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
+            sum.z() +=
+                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
+                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
+            sum.w() +=
+                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
+                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
+                       sum.w() * sc[5]) -
+               dmin * smin;
+    }
+#else
+    // The reordered Q5_K layout is only produced for QK_K == 256; in this branch tmp stays 0.
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { // halve the XOR distance each step
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) { // only one work-item per group stores the row result
+        dst[row] = tmp;
+    }
+}
+
 static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
                                        const sycl::nd_item<3> &item_ct1) {

@@ -1599,6 +1713,19 @@ static void dequantize_mul_mat_vec_q4_K_sycl_reorder(const void *vx, const float
        });
 }

+static void dequantize_mul_mat_vec_q5_K_sycl_reorder(const void *vx, const float *y, // host-side launcher for the reordered Q5_K kernel
+                                                     float *dst, const int ncols,
+                                                     const int nrows,
+                                                     dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0); // the kernel computes ncols / QK_K blocks per row
+    const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); // one work-group of QK_WARP_SIZE work-items
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), // nrows work-groups along dim 2
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q5_k_reorder(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
 static void dequantize_mul_mat_vec_q6_K_sycl_reorder(const void *vx, const float *y,
                                                     float *dst, const int ncols,
                                                     const int nrows,
@@ -1695,7 +1822,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
            }
            break;
        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                dequantize_mul_mat_vec_q5_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            } else {
+                dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            }
            break;
        case GGML_TYPE_Q6_K:
            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
@@ -124,6 +124,11 @@ static __dpct_inline__ T op_exp(T x) {
    return sycl::exp(x);
 }

+template<typename T>
+static __dpct_inline__ T op_expm1(T x) { // scalar helper used by the EXPM1 unary op
+    return sycl::expm1(x); // sycl::expm1 computes exp(x) - 1
+}
+
 template<typename T>
 static __dpct_inline__ T op_log(T x) {
    if (x <= static_cast<T>(0)) {
@@ -266,13 +271,6 @@ static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl:
    }
 }

-template<typename T>
-static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_floor(x[i]);
-    }
-}
-
 template<typename T>
 static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
@@ -280,20 +278,6 @@ static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::
    }
 }

-template<typename T>
-static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_round(x[i]);
-    }
-}
-
-template<typename T>
-static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_trunc(x[i]);
-    }
-}
-
 template<typename T>
 static void clamp(const T * x, T * dst, const float min, const float max, const int k,
                      const sycl::nd_item<1> &item_ct1) {
@@ -605,6 +589,12 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
    });
 }

+static inline void ggml_sycl_op_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // EXPM1 through the generic unary helper
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { // generic lambda: presumably invoked per element by the helper
+        return op_expm1(x);
+    });
+}
+
 static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
@@ -728,16 +718,9 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
 }

 static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, 256);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
-                                  sycl::range<1>(256)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { // replaces the dedicated floor kernel launch with the generic unary helper
+        return op_floor(x);
+    });
 }

 static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -747,29 +730,15 @@ static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tenso
 }

 static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, 256);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
-                                  sycl::range<1>(256)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { // replaces the dedicated round kernel launch with the generic unary helper
+        return op_round(x);
+    });
 }

 static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, 256);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
-                                  sycl::range<1>(256)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { // replaces the dedicated trunc kernel launch with the generic unary helper
+        return op_trunc(x);
+    });
 }

 static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -1018,6 +987,11 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_op_exp(ctx, dst);
 }

+void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // public entry point for the EXPM1 unary op
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); // same scoped debug helper as the neighbouring exp/log wrappers
+    ggml_sycl_op_expm1(ctx, dst);
+}
+
 void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_log(ctx, dst);
@@ -59,6 +59,8 @@ void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

 void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

+void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

 void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
@@ -3685,6 +3685,149 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
    return true;
 }

+// Reorder Q4_K MoE weights in place: each expert slice of expert_bytes becomes a self-contained SoA layout [qs][scales][dm]. Returns false when the temp buffer cannot be allocated, true otherwise.
+static bool reorder_qw_q4_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
+    GGML_ASSERT(expert_bytes % sizeof(block_q4_K) == 0); // a slice must hold a whole number of blocks
+    const int    blocks_per_expert = (int) (expert_bytes / sizeof(block_q4_K));
+    const size_t total_bytes       = expert_bytes * (size_t) n_expert; // size of all expert slices together
+
+    sycl_reorder_temp_buffer tmp(stream, total_bytes); // scratch copy of the original data, read while rewriting in place
+    if (!tmp) {
+        GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
+        return false;
+    }
+    uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
+    if (!g_ggml_sycl_use_async_mem_op) { // block on the copy unless async memory ops are enabled
+        copy_event.wait();
+    }
+
+    const int total_blocks = blocks_per_expert * (int) n_expert;
+    auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) { // one work-item per source block
+        const int          gb   = gb_;
+        const int          e    = gb / blocks_per_expert; // expert index
+        const int          ib   = gb % blocks_per_expert; // block index within that expert
+        const block_q4_K * x    = (const block_q4_K *) (tmp_buf + (size_t) e * expert_bytes); // source blocks of expert e (scratch copy)
+        uint8_t *          base = data_device + (size_t) e * expert_bytes; // destination slice of expert e
+
+        auto * qs_ptr     = base;
+        auto * scales_ptr = qs_ptr + QK_K / 2 * blocks_per_expert; // scales follow all qs bytes of the slice
+        auto * dm_ptr     = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert); // dm values follow all scales
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+        dm_ptr[ib] = x[ib].dm;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) { // block on the kernel unless async memory ops are enabled
+        reorder_event.wait_and_throw();
+    }
+    return true;
+}
+
+// Reorder Q5_K MoE weights in place: each expert slice of expert_bytes becomes a self-contained SoA layout [qs][qh][scales][dm]. Returns false when the temp buffer cannot be allocated, true otherwise.
+static bool reorder_qw_q5_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
+    GGML_ASSERT(expert_bytes % sizeof(block_q5_K) == 0); // a slice must hold a whole number of blocks
+    const int    blocks_per_expert = (int) (expert_bytes / sizeof(block_q5_K));
+    const size_t total_bytes       = expert_bytes * (size_t) n_expert; // size of all expert slices together
+
+    sycl_reorder_temp_buffer tmp(stream, total_bytes); // scratch copy of the original data, read while rewriting in place
+    if (!tmp) {
+        GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
+        return false;
+    }
+    uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
+    if (!g_ggml_sycl_use_async_mem_op) { // block on the copy unless async memory ops are enabled
+        copy_event.wait();
+    }
+
+    const int total_blocks = blocks_per_expert * (int) n_expert;
+    auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) { // one work-item per source block
+        const int          gb   = gb_;
+        const int          e    = gb / blocks_per_expert; // expert index
+        const int          ib   = gb % blocks_per_expert; // block index within that expert
+        const block_q5_K * x    = (const block_q5_K *) (tmp_buf + (size_t) e * expert_bytes); // source blocks of expert e (scratch copy)
+        uint8_t *          base = data_device + (size_t) e * expert_bytes; // destination slice of expert e
+
+        auto * qs_ptr     = base;
+        auto * qh_ptr     = qs_ptr + (QK_K / 2) * blocks_per_expert; // qh follows all qs bytes of the slice
+        auto * scales_ptr = qh_ptr + (QK_K / 8) * blocks_per_expert; // scales follow all qh bytes
+        auto * dm_ptr     = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert); // dm values follow all scales
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+        for (int j = 0; j < QK_K / 8; ++j) {
+            qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
+        }
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+        dm_ptr[ib] = x[ib].dm;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) { // block on the kernel unless async memory ops are enabled
+        reorder_event.wait_and_throw();
+    }
+    return true;
+}
+
+// Reorder Q6_K MoE weights in place: each expert slice of expert_bytes becomes a self-contained SoA layout [ql][qh][scales][d]. Returns false when the temp buffer cannot be allocated, true otherwise.
+static bool reorder_qw_q6_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
+    GGML_ASSERT(expert_bytes % sizeof(block_q6_K) == 0); // a slice must hold a whole number of blocks
+    const int    blocks_per_expert = (int) (expert_bytes / sizeof(block_q6_K));
+    const size_t total_bytes       = expert_bytes * (size_t) n_expert; // size of all expert slices together
+
+    sycl_reorder_temp_buffer tmp(stream, total_bytes); // scratch copy of the original data, read while rewriting in place
+    if (!tmp) {
+        GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
+        return false;
+    }
+    uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
+    if (!g_ggml_sycl_use_async_mem_op) { // block on the copy unless async memory ops are enabled
+        copy_event.wait();
+    }
+
+    const int total_blocks = blocks_per_expert * (int) n_expert;
+    auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) { // one work-item per source block
+        const int          gb   = gb_;
+        const int          e    = gb / blocks_per_expert; // expert index
+        const int          ib   = gb % blocks_per_expert; // block index within that expert
+        const block_q6_K * x    = (const block_q6_K *) (tmp_buf + (size_t) e * expert_bytes); // source blocks of expert e (scratch copy)
+        uint8_t *          base = data_device + (size_t) e * expert_bytes; // destination slice of expert e
+
+        auto * ql_ptr     = base;
+        auto * qh_ptr     = ql_ptr + (QK_K / 2) * blocks_per_expert; // qh follows all ql bytes of the slice
+        auto * scales_ptr = qh_ptr + (QK_K / 4) * blocks_per_expert; // scales follow all qh bytes
+        auto * d_ptr      = (sycl::half *) (scales_ptr + (QK_K / 16) * blocks_per_expert); // d values follow all scales
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            ql_ptr[ib * (QK_K / 2) + j] = x[ib].ql[j];
+        }
+        for (int j = 0; j < QK_K / 4; ++j) {
+            qh_ptr[ib * (QK_K / 4) + j] = x[ib].qh[j];
+        }
+        for (int j = 0; j < QK_K / 16; ++j) {
+            scales_ptr[ib * (QK_K / 16) + j] = x[ib].scales[j];
+        }
+        d_ptr[ib] = x[ib].d;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) { // block on the kernel unless async memory ops are enabled
+        reorder_event.wait_and_throw();
+    }
+    return true;
+}
+
 static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
    GGML_ASSERT(size % sizeof(block_q3_K) == 0);
    GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
@@ -3840,6 +3983,22 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
    size_t nrows = src0->ne[1];
    size_t size = ggml_nbytes(src0);

+    // MoE expert weights are addressed per expert via nb[2], so each slice must
+    // remain self-contained after reorder.
+    if (src0->ne[2] > 1) {
+        GGML_ASSERT((size_t) size == (size_t) src0->ne[2] * src0->nb[2]);
+        switch (src0->type) {
+            case GGML_TYPE_Q4_K:
+                return reorder_qw_q4_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
+            case GGML_TYPE_Q5_K:
+                return reorder_qw_q5_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
+            case GGML_TYPE_Q6_K:
+                return reorder_qw_q6_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
+            default:
+                return false;
+        }
+    }
+
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
@@ -3854,7 +4013,6 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
        case GGML_TYPE_Q6_K:
            return reorder_qw_q6_k(data_device, size, 0, stream);
        default:
-            GGML_ABORT("reorder_qw() called with unsupported type");
            return false;
    }
 }
@@ -3902,6 +4060,23 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
    }
 }

+// Lazily reorder supported MoE expert weights (Q4_K, Q5_K, Q6_K) once their fused path is used; no-op when reordering is disabled, unsupported, or already done.
+static void opt_for_reorder_id(ggml_backend_sycl_context * ctx, const ggml_tensor * src0) {
+    if (g_ggml_sycl_disable_optimize || !ctx->opt_feature.reorder) { // optimization globally off or not enabled for this context
+        return;
+    }
+    if (src0->type != GGML_TYPE_Q4_K && src0->type != GGML_TYPE_Q5_K && src0->type != GGML_TYPE_Q6_K) { // only these three types are handled here
+        return;
+    }
+    ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
+    if (!extra || extra->optimized_feature.reorder) { // nowhere to record the flag, or the tensor is already reordered
+        return;
+    }
+    if (reorder_qw(src0, ctx->stream())) { // set the flag only when the reorder reports success
+        extra->optimized_feature.reorder = true;
+    }
+}
+

 static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
@@ -4067,11 +4242,6 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
    if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
    if (!ggml_is_contiguous(src1)) return false;

-    // Reorder layout not supported; fall back.
-    const ggml_tensor_extra_gpu * src0_extra =
-        static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
-    if (src0_extra && src0_extra->optimized_feature.reorder) return false;
-
    const int64_t n_ids_per_group = ids->ne[0];
    if (ids->ne[1] != 1) return false;
    if (ne11 != 1 && ne11 != n_ids_per_group) return false;
@@ -4081,16 +4251,37 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
    const int       n_experts_used   = (int) n_ids_per_group;
    const int       nrows            = (int) src0->ne[1];

+    // Lazily reorder the (Q4_K/Q5_K/Q6_K) expert weights into a per-expert SoA layout, then run the reorder
+    // GEMV. Placed after the bail checks so a non-dispatchable op does not pay the reorder cost.
+    opt_for_reorder_id(&ctx, src0);
+    const ggml_tensor_extra_gpu * src0_extra =
+        static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
+    const bool use_reorder = src0_extra && src0_extra->optimized_feature.reorder;
+
    ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
        (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
    char * src1_ddq = src1_q8_alloc.get();
-    quantize_row_q8_1_sycl<quantize_q8_1>(
-        (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
-        src1_padded_cols, stream);
+    if (use_reorder) {
+        quantize_row_q8_1_sycl<quantize_and_reorder_q8_1_soa>(
+            (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
+            src1_padded_cols, stream);
+    } else {
+        quantize_row_q8_1_sycl<quantize_q8_1>(
+            (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
+            src1_padded_cols, stream);
+    }

    const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
    const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;

+    if (use_reorder) {
+        return ggml_sycl_mul_mat_vec_q_id_reorder(
+            src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
+            (float *) dst->data, (int) ne10, nrows, n_experts_used,
+            /*expert_weight_stride=*/ src0->nb[2],
+            /*dst_row_stride=*/ dst->nb[1],
+            src1_row_stride, stream);
+    }
    return ggml_sycl_mul_mat_vec_q_id(
        src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
        (float *) dst->data, (int) ne10, nrows, n_experts_used,
@@ -4489,6 +4680,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
                case GGML_UNARY_OP_EXP:
                    ggml_sycl_exp(ctx, dst);
                    break;
+                case GGML_UNARY_OP_EXPM1:
+                    ggml_sycl_expm1(ctx, dst);
+                    break;
                case GGML_UNARY_OP_SOFTPLUS:
                    ggml_sycl_softplus(ctx, dst);
                    break;
@@ -5138,6 +5332,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_EXPM1:
                case GGML_UNARY_OP_SOFTPLUS:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_CEIL:
@@ -5145,11 +5340,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_UNARY_OP_FLOOR:
                case GGML_UNARY_OP_ROUND:
                case GGML_UNARY_OP_TRUNC:
-#if defined (GGML_SYCL_F16)
-                    return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
-#else
-                    return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
-#endif
+                    return true;
                default:
                    return false;
            }
@@ -2468,3 +2468,118 @@ bool ggml_sycl_mul_mat_vec_q_id(
            return false;
    }
 }
+
+// Reorder (SoA) MoE expert GEMV: MoE expert/row/lane indexing (from mul_mat_vec_q_moe) with the
+// dense-reorder per-block reads (from mul_mat_vec_q_reorder). Each expert slice in vx_base is a
+// self-contained SoA, so nblocks = nrows*(ncols/qk) per expert and the constant expert stride holds.
+template <typename reorder_vec_dot_q_sycl>
+static void mul_mat_vec_q_moe_reorder(
+    const void * __restrict__ vx_base, const void * __restrict__ vy_base,
+    float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
+    const int ncols, const int nrows,
+    const size_t expert_weight_stride, const size_t dst_row_stride, // both strides are byte offsets (applied to char pointers below)
+    const size_t src1_row_stride,
+    const sycl::nd_item<3> & item_ct1) {
+    using block_type   = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
+    using block_traits = typename block_type::traits;
+
+    const int expert_idx = item_ct1.get_group(1); // slot in ids_dev handled by this work-group
+    const int i02        = ids_dev[expert_idx]; // expert id stored in that slot
+
+    const char * vx  = (const char *) vx_base + (size_t) i02 * expert_weight_stride; // start of the selected expert's weight slice
+    const char * vy  = (const char *) vy_base + (size_t) expert_idx * src1_row_stride; // quantized src1 row for this slot
+    float *      dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride); // output row for this slot
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
+    if (row >= nrows) { // rows past the end have no work
+        return;
+    }
+
+    const auto sg = item_ct1.get_sub_group();
+
+    const int     blocks_per_row              = ncols / block_traits::qk;
+    constexpr int blocks_per_subgroup         = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
+    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
+    const int     nblocks                     = nrows * (ncols / block_traits::qk); // blocks in one expert slice
+
+    static_assert(blocks_per_subgroup > 0);
+    static_assert(block_elements_per_subgroup > 0);
+
+    float partial_sum = 0.0f;
+    for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) { // block_elements_per_subgroup lanes share one block
+        const int ibx = row * blocks_per_row + i; // block index within the expert slice
+
+        const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
+        const auto d_offset  = block_type::get_d_offset(nrows, ncols, ibx);
+
+        const int           iby            = i * block_type::block_to_q8_1_ratio(); // index of the first matching q8_1 block
+        const int8_t *      q8_1_quant_ptr = (const int8_t *) vy + iby * QK8_1;
+        const sycl::half2 * q8_1_ds_ptr    = (const sycl::half2 *) ((const char *) vy + ncols + iby * sizeof(sycl::half2)); // ds values are read from after the ncols quant bytes
+
+#pragma unroll
+        for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
+            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
+        }
+    }
+
+    auto sum = sycl::reduce_over_group(sg, partial_sum, std::plus<>()); // add the lanes' partial sums
+    if (sg.leader()) { // a single lane stores the row result
+        dst[row] = sum;
+    }
+}
+
+template <typename reorder_vec_dot_q_sycl>
+static void launch_mul_mat_vec_q_moe_reorder( // submits mul_mat_vec_q_moe_reorder for all used experts and rows
+    const void * vx_base, const void * vy, const int32_t * ids_dev,
+    float * dst_base, const int ncols, const int nrows, const int n_experts_used,
+    const size_t expert_weight_stride, const size_t dst_row_stride,
+    const size_t src1_row_stride,
+    dpct::queue_ptr stream) {
+    const int            block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; // ceil(nrows / GGML_SYCL_MMV_Y) row groups
+    const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y); // dim 1: expert slot, dim 2: row group
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); // GGML_SYCL_MMV_Y rows per group, WARP_SIZE lanes per row
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl>(
+                    vx_base, vy, dst_base, ids_dev, ncols, nrows,
+                    expert_weight_stride, dst_row_stride, src1_row_stride, item);
+            });
+    });
+}
+
+bool ggml_sycl_mul_mat_vec_q_id_reorder( // picks the reorder MoE GEMV instantiation for src0_type; false means the type is not handled
+    enum ggml_type     src0_type,
+    const void *       vx_base,
+    const void *       vy,
+    const int32_t *    ids_dev,
+    float *            dst_base,
+    int                ncols,
+    int                nrows,
+    int                n_experts_used,
+    size_t             expert_weight_stride,
+    size_t             dst_row_stride,
+    size_t             src1_row_stride,
+    dpct::queue_ptr    stream) {
+    switch (src0_type) { // every supported case forwards the same arguments; only the template argument differs
+        case GGML_TYPE_Q4_K:
+            launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(
+                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+            return true;
+        case GGML_TYPE_Q5_K:
+            launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(
+                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+            return true;
+        case GGML_TYPE_Q6_K:
+            launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(
+                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+            return true;
+        default:
+            return false; // no kernel was launched for this type
+    }
+}
@@ -40,4 +40,21 @@ bool ggml_sycl_mul_mat_vec_q_id(
    size_t             src1_row_stride,      // 0 = shared src1, else per-expert stride in bytes
    dpct::queue_ptr    stream);

+// Reorder (SoA) variant of the fused MoE expert GEMV.
+// vx_base: each expert slice (stride expert_weight_stride == src0->nb[2]) is a self-contained reorder/SoA layout.
+// vy: src1 quantized with quantize_and_reorder_q8_1_soa (per-row SoA). Returns true after invoking the launcher for Q4_K/Q5_K/Q6_K; returns false (nothing launched) for any other src0_type.
+bool ggml_sycl_mul_mat_vec_q_id_reorder(
+    enum ggml_type     src0_type,
+    const void *       vx_base,
+    const void *       vy,
+    const int32_t *    ids_dev,
+    float *            dst_base,
+    int                ncols,
+    int                nrows,
+    int                n_experts_used,
+    size_t             expert_weight_stride,
+    size_t             dst_row_stride,
+    size_t             src1_row_stride,      // NOTE(review): presumably same meaning as in ggml_sycl_mul_mat_vec_q_id (0 = shared src1) - confirm
+    dpct::queue_ptr    stream);
+
+
 #endif // GGML_SYCL_MMVQ_HPP
@@ -902,14 +902,17 @@ struct vk_device_struct {
    vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
    vk_pipeline pipeline_timestep_embedding_f32;
    vk_pipeline pipeline_conv_transpose_1d_f32;
+    vk_pipeline pipeline_col2im_1d_f32;
+    vk_pipeline pipeline_col2im_1d_f16;
+    vk_pipeline pipeline_col2im_1d_bf16;
    vk_pipeline pipeline_snake_f32;
    vk_pipeline pipeline_snake_f16;
    vk_pipeline pipeline_snake_bf16;
    vk_pipeline pipeline_pool2d_f32;
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
-    // [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
-    vk_pipeline pipeline_gated_delta_net[3][2];
+    // [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128
+    vk_pipeline pipeline_gated_delta_net[4][2];
    vk_pipeline pipeline_ssm_scan_f32_d128;
    vk_pipeline pipeline_ssm_scan_f32_d256;
    vk_pipeline pipeline_ssm_conv_f32;
@@ -1552,6 +1555,16 @@ struct vk_op_timestep_embedding_push_constants {
    uint32_t max_period;
 };

+struct vk_op_col2im_1d_push_constants { // push constants for the col2im_1d pipelines; same fields, in the same order, as the push_constant block in col2im_1d.comp
+    uint32_t T_out;   // dst->ne[0]
+    uint32_t OC;      // op_params[1] of the COL2IM_1D node
+    uint32_t K_OC;    // src0->ne[0]
+    uint32_t T_in;    // src0->ne[1]
+    uint32_t K;       // K_OC / OC, computed on the host
+    int32_t  stride;  // op_params[0]
+    int32_t  p0;      // op_params[2]; the shader adds it to the output index to get the uncropped position
+};
+
 struct vk_op_conv_transpose_1d_push_constants {
    uint32_t Cout;
    uint32_t Cin;
@@ -3067,8 +3080,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
                                                       vk::MemoryPropertyFlagBits::eDeviceLocal});
        } else if (device->uma) {
-            // Fall back to host memory type
-            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
+            // On UMA, prefer host-visible memory so direct tensor borrowing works.
+            // If unavailable, fall back to device-local memory.
+            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
+                                                       vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                       vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
        } else if (device->disable_host_visible_vidmem) {
            if (device->allow_sysmem_fallback) {
@@ -5203,6 +5218,9 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32,  "col2im_1d_f32",  col2im_1d_f32_len,  col2im_1d_f32_data,  "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16,  "col2im_1d_f16",  col2im_1d_f16_len,  col2im_1d_f16_data,  "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);

    ggml_vk_create_pipeline(device, device->pipeline_snake_f32,  "snake_f32",  snake_f32_len,  snake_f32_data,  "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_snake_f16,  "snake_f16",  snake_f16_len,  snake_f16_data,  "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
@@ -5215,14 +5233,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);

    {
-        const uint32_t gdn_sizes[] = {32, 64, 128};
+        const uint32_t gdn_sizes[] = {16, 32, 64, 128};
        const char * gdn_names[][2] = {
+            {"gated_delta_net_f32_d16",     "gated_delta_net_f32_d16_kda"},
            {"gated_delta_net_f32_d32",     "gated_delta_net_f32_d32_kda"},
            {"gated_delta_net_f32_d64",     "gated_delta_net_f32_d64_kda"},
            {"gated_delta_net_f32_d128",    "gated_delta_net_f32_d128_kda"},
        };
-        const bool use_subgroup_reduce = device->subgroup_arithmetic;
-        for (uint32_t si = 0; si < 3; si++) {
+        for (uint32_t si = 0; si < 4; si++) {
            const uint32_t S_V = gdn_sizes[si];
            GGML_ASSERT(is_pow2(S_V));

@@ -5236,10 +5254,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
                lanes_per_column = std::min(S_V, device->subgroup_size);
            }

-            const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
+            // gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and
+            // S_V % LANES_PER_COLUMN == 0 to avoid bounds checks.
+            while (lanes_per_column > 1u) {
+                const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 &&
+                                         (S_V % lanes_per_column) == 0;
+                const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0;
+                if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) {
+                    break;
+                }
+                lanes_per_column >>= 1u;
+            }
+
+            GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0);
+            GGML_ASSERT((S_V % lanes_per_column) == 0);
+            GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0);
+
+            const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size;
+            const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce;
+            const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce;
+            const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce;
            size_t gdn_len;
            const void * gdn_data;
-            if (use_subgroup_reduce && need_clustered_shader) {
+            if (use_clustered_reduce) {
                gdn_len = gated_delta_net_f32_len;
                gdn_data = (const void *)gated_delta_net_f32_data;
            } else if (use_subgroup_reduce) {
@@ -5256,7 +5293,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
            for (uint32_t kda = 0; kda < 2; kda++) {
                ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda],
                    gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
-                    wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
+                    wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size);
            }
        }
    }
@@ -10702,6 +10739,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_conv_transpose_1d_f32;
        }
        return nullptr;
+    case GGML_OP_COL2IM_1D:
+        switch (src0->type) {
+            case GGML_TYPE_F32:  return ctx->device->pipeline_col2im_1d_f32;
+            case GGML_TYPE_F16:  return ctx->device->pipeline_col2im_1d_f16;
+            case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16;
+            default:             return nullptr;
+        }
    case GGML_OP_POOL_2D:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_pool2d_f32;
@@ -10723,9 +10767,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
            uint32_t si;
            switch (S_v) {
-                case 32:  si = 0; break;
-                case 64:  si = 1; break;
-                case 128: si = 2; break;
+                case 16:  si = 0; break;
+                case 32:  si = 1; break;
+                case 64:  si = 2; break;
+                case 128: si = 3; break;
                default: return nullptr;
            }
            return ctx->device->pipeline_gated_delta_net[si][kda];
@@ -11147,6 +11192,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        {
            elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
        } break;
+    case GGML_OP_COL2IM_1D:
+        {
+            elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 };
+        } break;
    case GGML_OP_POOL_2D:
        {
            const uint32_t N = dst->ne[3];
@@ -12936,6 +12985,32 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
 }

+static void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { // fills the col2im_1d push constants from dst->op_params and tensor dims, then hands off to ggml_vk_op_f32
+    // src0: [K_OC, T_in] columns from matmul
+    // dst:  [T_out, OC]
+
+    const int32_t stride = dst->op_params[0];
+    const int32_t oc     = dst->op_params[1]; // becomes OC below
+    const int32_t p0     = dst->op_params[2];
+
+    const uint32_t K_OC  = static_cast<uint32_t>(src0->ne[0]);
+    const uint32_t T_in  = static_cast<uint32_t>(src0->ne[1]);
+    const uint32_t T_out = static_cast<uint32_t>(dst->ne[0]);
+    const uint32_t OC    = static_cast<uint32_t>(oc);
+    const uint32_t K     = K_OC / OC; // NOTE(review): integer division; assumes oc > 0 and K_OC is a multiple of OC - confirm the op constructor guarantees this
+
+    vk_op_col2im_1d_push_constants p{}; // value-initialized, then every field is assigned below
+    p.T_out  = T_out;
+    p.OC     = OC;
+    p.K_OC   = K_OC;
+    p.T_in   = T_in;
+    p.K      = K;
+    p.stride = stride;
+    p.p0     = p0;
+
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p)); // src0 is the only input tensor; the remaining source slots are passed as nullptr
+}
+
 // Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
 // Match the naive mul -> sin -> sqr -> mul -> add chain and run the
 // dedicated kernel directly. The pattern is validated by
@@ -14423,6 +14498,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_TIMESTEP_EMBEDDING:
        ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);

+        break;
+    case GGML_OP_COL2IM_1D:
+        ggml_vk_col2im_1d(ctx, compute_ctx, src0, node);
+
        break;
    case GGML_OP_CONV_TRANSPOSE_1D:
        ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
@@ -17136,7 +17215,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_GATED_DELTA_NET:
            {
                const uint32_t S_v = op->src[2]->ne[0];
-                if (S_v != 32 && S_v != 64 && S_v != 128) {
+                if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) {
                    return false;
                }
                for (int i = 0; i < 6; i++) {
@@ -17188,6 +17267,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_CONV_TRANSPOSE_1D:
            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_COL2IM_1D:
+            return (op->src[0]->type == GGML_TYPE_F32 ||
+                    op->src[0]->type == GGML_TYPE_F16 ||
+                    op->src[0]->type == GGML_TYPE_BF16) &&
+                   op->type == op->src[0]->type &&
+                   ggml_is_contiguous(op->src[0]) &&
+                   ggml_is_contiguous(op);
        case GGML_OP_CONV_2D:
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
@@ -18019,6 +18105,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
            const int32_t p0 = tensor->op_params[1];
            const int32_t d0 = tensor->op_params[2];
            tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
+        } else if (tensor->op == GGML_OP_COL2IM_1D) {
+            const int32_t stride = tensor->op_params[0];
+            const int32_t oc     = tensor->op_params[1];
+            const int32_t p0     = tensor->op_params[2];
+            tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0);
        } else if (tensor->op == GGML_OP_POOL_2D) {
            enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
            const int32_t k0 = tensor->op_params[1];
@@ -0,0 +1,61 @@
+#version 450
+
+#include "types.glsl"
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};   // columns: [K_OC, T_in]
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};  // output:  [T_out, OC]
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter { // same fields, in the same order, as vk_op_col2im_1d_push_constants on the host
+    uint32_t T_out;   // extent of the first output dimension; bounds gl_GlobalInvocationID.x
+    uint32_t OC;      // extent of the second output dimension; bounds gl_GlobalInvocationID.y
+    uint32_t K_OC;    // element stride between consecutive t_in columns of data_a
+    uint32_t T_in;    // number of input columns; upper clamp for t_in
+    uint32_t K;       // taps per output channel; column offset is oc * K + k
+    int32_t  stride;  // step of t_abs per t_in
+    int32_t  p0;      // added to t_out to obtain the uncropped position t_abs
+} p;
+
+// Load A_TYPE to float
+float load_col(uint32_t idx) { // reads data_a[idx] and returns it as float
+#if defined(DATA_A_BF16)
+    return bf16_to_fp32(uint32_t(data_a[idx])); // BF16 variant is built with A_TYPE = uint16_t, so the raw value goes through bf16_to_fp32 (not defined in this file; presumably from types.glsl)
+#else
+    return float(data_a[idx]); // F32/F16 variants: plain constructor conversion
+#endif
+}
+
+// Store float as D_TYPE
+void store_dst(uint32_t idx, float v) { // converts v to D_TYPE and writes it to data_d[idx]
+#if defined(DATA_A_BF16)
+    data_d[idx] = D_TYPE(fp32_to_bf16(v)); // BF16 variant is built with D_TYPE = uint16_t; value goes through fp32_to_bf16 (not defined in this file; presumably from types.glsl)
+#else
+    data_d[idx] = D_TYPE(v); // F32/F16 variants: plain constructor conversion
+#endif
+}
+
+void main() { // each invocation produces one output element (t_out, oc) by summing the input columns that land on it
+    const uint32_t t_out = gl_GlobalInvocationID.x;
+    const uint32_t oc    = gl_GlobalInvocationID.y;
+    if (t_out >= p.T_out || oc >= p.OC) return; // invocations outside the [T_out, OC] output do nothing
+
+    const int32_t t_abs = int32_t(t_out) + p.p0; // absolute position in uncropped signal
+
+    // Gather: only the ceil(K/stride) columns that scatter into t_abs, no modulo
+    int32_t t_in_min = (t_abs - int32_t(p.K) + p.stride) / p.stride; // equals ceil((t_abs - K + 1) / stride) when the numerator is non-negative, i.e. the smallest t_in with k <= K - 1
+    if (t_in_min < 0) t_in_min = 0; // clamp to the first input column
+    int32_t t_in_max = t_abs / p.stride; // largest t_in with k >= 0; NOTE(review): assumes t_abs >= 0 (p0 >= 0), otherwise k below could go negative and wrap when cast to uint32_t - confirm
+    if (t_in_max >= int32_t(p.T_in)) t_in_max = int32_t(p.T_in) - 1; // clamp to the last input column
+
+    float val = 0.0; // accumulation is done in float for all storage types
+    for (int32_t t_in = t_in_min; t_in <= t_in_max; t_in++) {
+        int32_t k = t_abs - t_in * p.stride; // tap index of column t_in that falls on t_abs
+        // col layout: [K_OC, T_in], column index = oc * K + k
+        uint32_t col_idx = (oc * p.K + uint32_t(k)) + uint32_t(t_in) * p.K_OC;
+        val += load_col(col_idx);
+    }
+
+    // dst layout: [T_out, OC], element (t_out, oc) = t_out + oc * T_out
+    store_dst(t_out + oc * p.T_out, val);
+}
@@ -1003,6 +1003,9 @@ void process_shaders() {
    string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

    string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"},  {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("col2im_1d_f32",  "col2im_1d.comp", {{"DATA_A_F32", "1"},  {"A_TYPE", "float"},     {"D_TYPE", "float"}});
+    string_to_spv("col2im_1d_f16",  "col2im_1d.comp", {{"DATA_A_F16", "1"},  {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("col2im_1d_bf16", "col2im_1d.comp", {{"DATA_A_BF16", "1"}, {"A_TYPE", "uint16_t"},  {"D_TYPE", "uint16_t"}});

    string_to_spv("snake_f32",  "snake.comp", {{"DATA_A_F32", "1"},  {"A_TYPE", "float"},     {"D_TYPE", "float"}});
    string_to_spv("snake_f16",  "snake.comp", {{"DATA_A_F16", "1"},  {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
@@ -129,7 +129,7 @@ void test_gbnf_generation(testing &t) {
        });

        assert_gbnf_equal(t, R"""(
-            root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
+            root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* ("<" | "</" | "</t" | "</ta" | "</tag")?
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )""", gbnf);
    });
@@ -1882,11 +1882,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de
        .expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
        .run();

-    // Python tool with multiline code in string
+    // Python tool with multiline code in string: the \n in the literal decodes to a real
+    // newline, emitted as a JSON \n escape (not a doubled backslash).
    tst.test("<|tool_call_start|>[python(code=\"def hello():\\n    print('hey')\")]<|tool_call_end|>")
        .tools({ python_tool })
        .expect_tool_calls({
-            { "python", R"#({"code": "def hello():\\n    print('hey')"})#", "" }
+            { "python", R"#({"code": "def hello():\n    print('hey')"})#", "" }
+        })
+        .run();
+
+    // String escape sequences decode to their actual characters (newline + tab here),
+    // so a "write a two line file" style call produces real line breaks, not literal "\n".
+    tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>")
+        .tools({ python_tool })
+        .expect_tool_calls({
+            { "python", R"#({"code": "First line\nSecond line\tindented"})#", "" }
+        })
+        .run();
+
+    // Escaped quotes inside a string argument survive the round-trip.
+    tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>")
+        .tools({ python_tool })
+        .expect_tool_calls({
+            { "python", R"#({"code": "print(\"hi\")"})#", "" }
        })
        .run();

@@ -2024,6 +2042,61 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            })
            .run();

+        tst.test(
+               "<tool_call>\n"
+               "<function=edit>\n"
+               "<parameter=filename>\n"
+               "foo.c\n"
+               "</parameter>\n"
+               "<parameter=oldString>\n"
+               "#iclunde\n"
+               "</parameter>\n"
+               "<parameter=newString>\n"
+               "#include\n"
+               "</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .enable_thinking(false)
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({
+                edit_tool
+        })
+            .expect_tool_calls({
+                { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} },
+            })
+            .run();
+
+        // a parameter value that itself ends in a newline (e.g. a source file with a
+        // trailing newline). The structural delimiter is "\n</parameter>\n", so the value
+        // "#include\n" renders as "...#include\n\n</parameter>\n". The trailing newline must
+        // be preserved faithfully (no stripping), and the generated grammar must admit a
+        // value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern.
+        tst.test(
+               "<tool_call>\n"
+               "<function=edit>\n"
+               "<parameter=filename>\n"
+               "foo.c\n"
+               "</parameter>\n"
+               "<parameter=oldString>\n"
+               "#iclunde\n"
+               "</parameter>\n"
+               "<parameter=newString>\n"
+               "#include\n"
+               "\n"
+               "</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .enable_thinking(false)
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({
+                edit_tool
+        })
+            .expect_tool_calls({
+                { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} },
+            })
+            .run();
+
+
        // test code that starts with indent
        tst.test(
               "<tool_call>\n"
@@ -323,6 +323,7 @@ struct cmd_params {
    std::vector<std::string>         hf_repo;
    std::vector<std::string>         hf_file;
    std::string                      hf_token;
+    bool                             offline;
    std::vector<int>                 n_prompt;
    std::vector<int>                 n_gen;
    std::vector<std::pair<int, int>> n_pg;
@@ -367,6 +368,7 @@ static const cmd_params cmd_params_defaults = {
    /* hf_repo              */ {},
    /* hf_file              */ {},
    /* hf_token             */ "",
+    /* offline              */ false,
    /* n_prompt             */ { 512 },
    /* n_gen                */ { 128 },
    /* n_pg                 */ {},
@@ -437,6 +439,8 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("                                              (default: unused)\n");
    printf("  -hft, --hf-token <token>                    Hugging Face access token\n");
    printf("                                              (default: value from HF_TOKEN environment variable)\n");
+    printf("  --offline                                   Offline mode: forces use of cache, prevents network access\n");
+    printf("                                              (default: disabled)\n");
    printf("  -p, --n-prompt <n>                          (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                             (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -pg <pp,tg>                                 (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
@@ -558,6 +562,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                    break;
                }
                params.hf_token = argv[i];
+            } else if (arg == "--offline") {
+                params.offline = true;
            } else if (arg == "-p" || arg == "--n-prompt") {
                if (++i >= argc) {
                    invalid_param = true;
@@ -1040,6 +1046,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {

            common_download_opts opts;
            opts.bearer_token = params.hf_token;
+            opts.offline         = params.offline;
            auto download_result = common_download_model(model, opts);
            if (download_result.model_path.empty()) {
                fprintf(stderr, "error: failed to download model from HuggingFace\n");
@@ -247,7 +247,9 @@ int32_t mtmd_helper_decode_image_chunk(
        llama_pos n_past,
        llama_seq_id seq_id,
        int32_t n_batch,
-        llama_pos * new_n_past) {
+        llama_pos * new_n_past,
+        mtmd_helper_post_decode_callback callback,
+        void * user_data) {
    GGML_ASSERT(n_batch > 0);
    auto chunk_type = mtmd_input_chunk_get_type(chunk);
    const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
@@ -302,10 +304,23 @@ int32_t mtmd_helper_decode_image_chunk(
        int32_t ret = llama_decode(lctx, batch_embd_view);
        if (ret != 0) {
            LOG_ERR("failed to decode %s\n", name);
-            llama_set_causal_attn(lctx, true); // restore causal attn
+            if (use_non_causal) {
+                llama_set_causal_attn(lctx, true);
+            }
            return ret;
        }

+        if (callback != nullptr) {
+            ret = callback(batch_embd_view, user_data);
+            if (ret != 0) {
+                LOG_ERR("post-decode callback failed\n");
+                if (use_non_causal) {
+                    llama_set_causal_attn(lctx, true);
+                }
+                return ret;
+            }
+        }
+
        LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);

        i_batch++;
@@ -379,7 +394,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
        LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);

        float * embd = mtmd_get_output_embd(ctx);
-        ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
+        ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past, nullptr, nullptr);
        if (ret != 0) {
            LOG_ERR("failed to decode %s\n", name);
            llama_batch_free(text_batch);
@@ -91,6 +91,8 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                               bool logits_last,
                                               llama_pos * new_n_past);

+typedef int32_t (*mtmd_helper_post_decode_callback)(struct llama_batch batch, void * user_data); // optional hook for mtmd_helper_decode_image_chunk: called after each successful llama_decode with that batch view and the caller's user_data; a non-zero return stops decoding and is returned to the caller
+
 // helper function to decode an image whose embeddings have already been calculated
 // this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
 // ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
@@ -101,7 +103,9 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                llama_pos n_past,
                                                llama_seq_id seq_id,
                                                int32_t n_batch,
-                                                llama_pos * new_n_past);
+                                                llama_pos * new_n_past,
+                                                mtmd_helper_post_decode_callback callback,
+                                                void * user_data);

 //
 // video input helpers (requires ffmpeg/ffprobe installed on the system)
@@ -96,16 +96,15 @@ struct mtmd_image_tokens {
            // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
            return (nx + 1) * ny + 2;
        }
-        // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
-        if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
-            return nx * ny;
-        }
        uint32_t nz = batch_f32.entries.size();
-        // TODO: simplify this by repeating the last frame until it fits the temporal merge
-        if (nz % n_temporal_merge != 0) {
-            nz = nz / n_temporal_merge + 1;
-        } else {
-            nz = nz / n_temporal_merge;
+        if (n_temporal_merge > 1) {
+            // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
+            // TODO: simplify this by repeating the last frame until it fits the temporal merge
+            if (nz % n_temporal_merge != 0) {
+                nz = nz / n_temporal_merge + 1;
+            } else {
+                nz = nz / n_temporal_merge;
+            }
        }
        return nx * ny * nz;
    }
@@ -40,6 +40,7 @@ def main(args_in: list[str] | None = None) -> None:
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
+    parser.add_argument("--offline", action="store_true", default=False, help="Offline mode: forces use of cache, prevents network access")
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
@@ -268,6 +269,8 @@ def start_server_background(args):
    ]
    server_args.extend(['--hf-repo', args.hf_repo])
    server_args.extend(['--hf-file', args.hf_file])
+    if args.offline:
+        server_args.append('--offline')
    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
    server_args.extend(['--ctx-size', args.ctx_size])
    server_args.extend(['--parallel', args.parallel])
@@ -539,37 +539,6 @@ bool server_tokens::validate(const struct llama_context * ctx) const {
    return true;
 }

-int32_t server_tokens::process_chunk(
-            llama_context * ctx,
-            mtmd_context * mctx,
-            size_t idx,
-            llama_pos pos,
-            int32_t seq_id,
-            size_t & n_tokens_out) const {
-    const auto & chunk = find_chunk(idx);
-    const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
-                        ? "image" : "audio";
-    SRV_INF("processing %s...\n", name);
-    int32_t n_batch = llama_n_batch(ctx);
-    int64_t t0 = ggml_time_ms();
-    llama_pos new_n_past; // unused for now
-    int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
-        chunk.get(),
-        pos,
-        seq_id,
-        n_batch,
-        true, // logits last
-        &new_n_past);
-    SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
-    if (result != 0) {
-        LOG_ERR("mtmd_helper_eval failed with status %d", result);
-        n_tokens_out = 0;
-        return result;
-    }
-    n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
-    return 0;
-}
-
 server_tokens server_tokens::clone() const {
    server_tokens res;
    res.has_mtmd = has_mtmd;
@@ -221,15 +221,6 @@ public:
    // make sure all text tokens are within the vocab range
    bool validate(const struct llama_context * ctx) const;

-    // encode and decode the image chunk
-    int32_t process_chunk(
-                llama_context * ctx,
-                mtmd_context * mctx,
-                size_t idx,
-                llama_pos pos,
-                int32_t seq_id,
-                size_t & n_tokens_out) const;
-
    server_tokens clone() const;
 };

@@ -15,11 +15,6 @@
 #include "mtmd.h"
 #include "mtmd-helper.h"

-#include "ggml-cpp.h"
-
-// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING]
-#include "../../src/llama-ext.h"
-
 #include <algorithm>
 #include <cstddef>
 #include <cinttypes>
@@ -81,7 +76,6 @@ struct server_slot {
    // multimodal
    mtmd_context * mctx = nullptr;
    mtmd::batch_ptr mbatch = nullptr;
-    std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context

    // speculative decoding
    common_speculative * spec;
@@ -207,6 +201,8 @@ struct server_slot {
    // Speculative decoding stats
    int32_t n_draft_total = 0;      // Total draft tokens generated
    int32_t n_draft_accepted = 0;   // Draft tokens actually accepted
+    int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
+    std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position

    void reset() {
        SLT_DBG(*this, "%s", "\n");
@@ -233,6 +229,8 @@ struct server_slot {
        // clear speculative decoding stats
        n_draft_total = 0;
        n_draft_accepted = 0;
+        n_draft_verif_steps = 0;
+        n_accepted_per_pos.clear();

        task_prev = std::move(task);
        task.reset();
@@ -244,15 +242,6 @@ struct server_slot {

        // clear multimodal state
        mbatch.reset();
-        mtgt[0] = ctx_tgt;
-        mtgt[1] = nullptr;
-        if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
-            // TODO: in the future, figure out how to infuse target embeddings to the images
-            //       for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
-            //       maybe we simply need to call `common_speculative_process()` ?
-            //       [TAG_MTMD_DRAFT_PROCESSING]
-            mtgt[1] = ctx_dft;
-        }
    }

    void init_sampler() const {
@@ -524,10 +513,22 @@ struct server_slot {
                llama_perf_context(ctx_tgt).n_reused);

        if (n_draft_total > 0) {
-            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            const float  draft_ratio  = (float) n_draft_accepted / n_draft_total;
+            const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
+
+            std::string acceptance_rates_per_pos;
+            if (n_draft_verif_steps > 0) {
+                for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
+                    if (i > 0) {
+                        acceptance_rates_per_pos += ", ";
+                    }
+                    acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
+                }
+            }
+
            SLT_INF(*this,
-                    "draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
-                    draft_ratio, n_draft_accepted, n_draft_total);
+                    "draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
        }

        common_speculative_print_stats(spec);
@@ -598,32 +599,38 @@ struct server_slot {
    int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
        GGML_ASSERT(mctx);
        const auto & input_tokens = task->tokens;
-        auto & chunk = input_tokens.find_chunk(idx);
+        const auto & chunk = input_tokens.find_chunk(idx);
        int32_t res = 0;

        auto try_decode = [&]() -> int32_t {
            if (mbatch) {
                float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
                if (embd) {
-                    for (auto * lctx : mtgt) {
-                        if (lctx == nullptr) {
-                            continue;
-                        }
-                        llama_pos new_n_past; // unused for now
-                        res = mtmd_helper_decode_image_chunk(
-                            mctx,
-                            lctx,
-                            chunk.get(),
-                            embd,
-                            prompt.tokens.pos_next(),
-                            id,
-                            llama_n_batch(lctx),
-                            &new_n_past
-                        );
-                        if (res != 0) {
-                            SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
-                            return -1;
+                    void * cb_data = spec;
+                    static auto cb = [](llama_batch batch, void * user_data) {
+                        common_speculative * spec = static_cast<common_speculative *>(user_data);
+                        if (!common_speculative_process(spec, batch)) {
+                            return 1;
                        }
+                        return 0;
+                    };
+
+                    llama_pos new_n_past; // unused for now
+                    res = mtmd_helper_decode_image_chunk(
+                        mctx,
+                        ctx_tgt,
+                        chunk.get(),
+                        embd,
+                        prompt.tokens.pos_next(),
+                        id,
+                        llama_n_batch(ctx_tgt),
+                        &new_n_past,
+                        cb,
+                        cb_data
+                    );
+                    if (res != 0) {
+                        SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
+                        return -1;
                    }
                    n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
                    return 0; // success
@@ -636,7 +643,8 @@ struct server_slot {
        res = try_decode();
        if (res == 0) {
            return 0;
-        } else if (res < 0) {
+        }
+        if (res < 0) {
            // fatal error
            return res;
        }
@@ -3350,48 +3358,6 @@ private:
            // TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL]
            //       for now, always re-evaluate for simplicity
            //       ref: https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4400925384
-            //
-            // | spec type   | need re-eval |
-            // | ---         | ---          |
-            // | draft model | no           | because the draft model does not use embeddings from the target
-            // | MTP (std)   | yes          |
-            // | MTP Gemma4  | no           | because the KV cache is shared
-            // | Eagle3      | yes          |
-            // | DFlash      | yes          | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982
-            //
-            // note: this logic is now moved in `common_speculative_process()`
-            //       keeping the sketch here until for a bit, until the logic is finalized
-            //
-            //if (ctx_dft) {
-            //    // TODO: update as needed for MTP, Eagle3, etc.
-            //    const bool need_tgt_embd = false;
-
-            //    if (need_tgt_embd) {
-            //        llama_synchronize(ctx_tgt);
-            //    }
-
-            //    // the logic here varies depending on the speculative decoding method
-            //    //  - some draft contexts require embeddings from the target context, others don't
-            //    //  - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
-            //    // TODO: extract this in a function ?
-            //    {
-            //        // TODO: hook the embeddings from the last target batch here
-            //        if (llama_model_has_encoder(model_dft.get())) {
-            //            //llama_encode(ctx_dft, ...);
-
-            //            GGML_ABORT("not implemented yet\n");
-            //        }
-
-            //        const int ret = llama_decode(ctx_dft.get(), batch_view);
-
-            //        if (ret != 0) {
-            //            SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
-
-            //            // TODO: handle error
-            //            break;
-            //        }
-            //    }
-            //}
            if (!common_speculative_process(spec.get(), batch_view)) {
                SRV_ERR("%s", "failed to process speculative batch\n");

@@ -3593,6 +3559,14 @@ private:

                // update how many tokens out of those tested were accepted
                slot.n_draft_accepted += ids.size() - 1;
+                slot.n_draft_verif_steps += 1;
+
+                if (slot.n_accepted_per_pos.empty()) {
+                    slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
+                }
+                for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
+                    slot.n_accepted_per_pos[i]++;
+                }

                // add accepted tokens to the prompt
                slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
Author	SHA1	Message	Date
Winston Ma	32120c10e3	vulkan: prefer host-visible memory buffers on UMA devices (#22930 ) * implement UMA host-visible memory * update based on 0cc4m's suggestion	2026-06-16 09:36:52 +02:00
Jeff Bolz	d5fb104293	vulkan: Support gated_delta_net with S_v=16 (#24581 )	2026-06-16 09:26:57 +02:00
Ruixiang Wang	635b65ad7a	spec: add spec metrics mean acceptance length and acceptance rate per position (#24536 ) * spec: add spec metrics mean acceptance length and acceptance per pos * fix as suggestion Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix as suggestion Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix as suggestion Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix as suggestions --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-06-16 10:23:09 +03:00
Adrien Gallouët	e3a74b2990	bench : add --offline (#24511 ) * bench : add --offline Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Add default Signed-off-by: Adrien Gallouët <angt@huggingface.co> --------- Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-16 08:26:05 +02:00
Frosty40	ac79caa7ce	sycl: support reordered Q4_K/Q5_K/Q6_K MoE MUL_MAT_ID (#24452 ) * sycl: support reordered Q4_K and Q5_K MoE MUL_MAT_ID Extend reordered-weight handling to fused MoE MUL_MAT_ID for Q4_K and Q5_K expert tensors and add Q5_K reordered DMMV coverage. Unsupported 3D reorder cases now fall back instead of aborting. * sycl: extend MoE reorder to Q6_K mul_mat_id	2026-06-16 08:35:00 +03:00
Neo Zhang	fdd109883d	[SYCL] Support OP EXPM1, support all UT cases of FLOOR, TRUNC, ROUND (#24363 ) * support OP EXPM1, support all UT cases of FLOOR, TRUNC, ROUND * fix conflict * rebase, support new UT case of repeat, concat	2026-06-16 08:34:29 +03:00
Todd Malsbary	4196b477da	sycl : Make GGML_SYCL_F16=ON the default (#23996 ) * Add -cl-fp32-correctly-rounded-divide-sqrt to F16=ON builds Signed-off-by: Todd Malsbary <todd.malsbary@intel.com> * Make GGML_SYCL_F16=ON the default Signed-off-by: Todd Malsbary <todd.malsbary@intel.com> * Leave F32 the default F16 remains explicitly set for example and Dockerfile builds. Signed-off-by: Todd Malsbary <todd.malsbary@intel.com> * Revert changes to examples/sycl/build scripts Signed-off-by: Todd Malsbary <todd.malsbary@intel.com> --------- Signed-off-by: Todd Malsbary <todd.malsbary@intel.com>	2026-06-16 08:34:02 +03:00
Pascal	ad39ccaa19	vulkan: add col2im_1d op (#24425 ) * vulkan: add GGML_OP_COL2IM_1D, follow-up to the CPU op * vulkan: col2im_1d bounded gather loop instead of full-K scan with modulo * vulkan: col2im_1d address review from @jeffbolznv * vulkan: col2im_1d return nullptr for unsupported types, address review from @0cc4m	2026-06-16 06:34:43 +02:00
Tarek Dakhran	7dad2f1a17	chat : fix LFM2 tool-call parsing double-escaping (#24667 ) * Add escape test cases * chat : fix LFM2 tool-call parsing double-escaping	2026-06-15 22:10:09 +02:00
Xuan-Son Nguyen	e36a602ba3	mtmd: fix miscounting n_tokens (#24656 )	2026-06-15 18:07:14 +02:00
Piotr Wilkin (ilintar)	38d546330a	chat: include full unparsed prompt in debug (#24650 ) message on parse error	2026-06-15 17:33:54 +02:00
Julien Jerphanion	a1eb756c0b	docs: Add instructions to install `llama.cpp` from conda-forge (#22219 ) * docs: Add instructions to install `llama.cpp` from conda-forge Signed-off-by: Julien Jerphanion <git@jjerphan.xyz> * Rewording of instructions Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Signed-off-by: Julien Jerphanion <git@jjerphan.xyz> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-06-15 17:12:25 +02:00
Pascal	581e8eca8b	chat: harden peg-native tool call parsing (#24329 ) * chat: harden peg-native tool call parsing accept an optional leading type: function field in build_json_tools_flat_keys so openai style tool calls parse on templates whose serialization opens on the name field. return a clean error and log the unparsed fragment on a final peg parse failure instead of throwing the raw parser position and input. keep the raw arguments string in func_args_not_string when it is not valid json instead of aborting the prompt render. * chat: surface peg-native parse failures a final peg parse failure threw the raw parser position and input. log the unparsed fragment and raise a clearer error instead, so a model output that does not match the expected format no longer fails silently with an empty assistant turn. minimal change, no behavior change on successful parses. * chat: handle openai style tool calls in peg-native * nits * common: scope OpenAI wrapper grammar trigger via autoparser flag * chat: gate type:function parsing leniency on the analysis flag Thread accept_openai_wrapper from the generator to build_json_tools_flat_keys so the leading "type": "function" field is accepted only when openai_wrapper_trigger is set.	2026-06-15 15:37:04 +02:00
Piotr Wilkin (ilintar)	0ae3f450f0	chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes (#24653 ) * chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes * update erroneous case in PEG parser test	2026-06-15 15:27:47 +02:00
Georgi Gerganov	e3cab403bf	mtmd : add post-decode callback (#24645 ) Assisted-by: pi:llama.cpp/Qwen3.6-27B	2026-06-15 16:02:05 +03:00