[SYCL] rename GGML_SYCL_SUPPORT_LEVEL_ZERO (#24719 )

* rename GGML_SYCL_SUPPORT_LEVEL_ZERO to GGML_SYCL_SUPPORT_LEVEL_ZERO_API, and GGML_SYCL_ENABLE_LEVEL_ZERO to GGML_SYCL_USE_LEVEL_ZERO_API * fix code format * fix error when rebase
sycl : support MUL_MAT and OUT_PROD with Q1_0 (#24721 )
2026-06-18 11:47:40 +02:00 · 2026-06-18 11:18:26 +03:00 · 2026-06-18 11:17:37 +03:00 · 2026-06-18 09:57:59 +02:00 · 2026-06-18 09:32:56 +02:00 · 2026-06-18 09:40:03 +03:00
57 changed files with 3435 additions and 1769 deletions
@@ -46,11 +46,13 @@ jobs:

    steps:
      - id: check
+        env:
+          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            echo "should_release=true" >> $GITHUB_OUTPUT
          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
-            if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
+            if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
              echo "should_release=false" >> $GITHUB_OUTPUT
            else
              echo "should_release=true" >> $GITHUB_OUTPUT
@@ -542,6 +544,7 @@ jobs:
    steps:
      - name: Set OpenVINO version output
        id: openvino_version
+        shell: bash
        run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT

      - name: Clone
@@ -20,16 +20,21 @@ int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);

-// hands the update over to the install script, which downloads and swaps the binary
+// Self-update is only supported for binaries built with llama-install.sh
 static int llama_update(int argc, char ** argv) {
    (void) argc;
    (void) argv;

+#ifdef LLAMA_INSTALL_BUILD
 #if defined(_WIN32)
    return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
 #else
    return system("curl -fsSL https://llama.app/install.sh | sh");
 #endif
+#else
+    printf("Updates are available only when installed from https://llama.app\n");
+    return 1;
+#endif
 }

 static const char * progname;
@@ -46,21 +51,29 @@ struct command {
    int (*func)(int, char **);
 };

+#ifdef LLAMA_INSTALL_BUILD
+#define UPDATE_HIDDEN false
+#else
+#define UPDATE_HIDDEN true
+#endif
+
 static const command cmds[] = {
-    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
-    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
-    {"update",        "Update llama to the latest release",                 {},           false, llama_update       },
-    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
-    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
-    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
-    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
-    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
-    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
-    {"version",       "Show version",                                       {},           false, version            },
-    {"licenses",      "Show third-party licenses",                          {"credits"},  false, licenses           },
-    {"help",          "Show available commands",                            {},           false, help               },
+    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
+    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
+    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
+    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
+    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
+    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
+    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,          llama_fit_params   },
+    {"quantize",      "Quantize a model",                                   {},           true,          llama_quantize     },
+    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,          llama_perplexity   },
+    {"version",       "Show version",                                       {},           false,         version            },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false,         licenses           },
+    {"help",          "Show available commands",                            {},           false,         help               },
 };

+#undef UPDATE_HIDDEN
+
 static int version(int argc, char ** argv) {
    printf("%s\n", llama_build_info());
    return 0;
@@ -997,3 +997,87 @@ std::vector<common_cached_model_info> common_list_cached_models() {

    return result;
 }
+
+bool common_download_remove(const std::string & hf_repo_with_tag) {
+    namespace fs = std::filesystem;
+
+    auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
+
+    if (tag.empty()) {
+        return hf_cache::remove_cached_repo(repo_id);
+    }
+
+    std::string tag_upper = tag;
+    for (char & c : tag_upper) {
+        c = (char) std::toupper((unsigned char) c);
+    }
+
+    auto files = hf_cache::get_cached_files(repo_id);
+    if (files.empty()) {
+        return false;
+    }
+
+    // collect snapshot entries whose tag matches
+    std::vector<fs::path> to_remove;
+    for (const auto & f : files) {
+        auto split = get_gguf_split_info(f.path);
+        if (split.tag == tag_upper) {
+            to_remove.emplace_back(f.local_path);
+        }
+    }
+
+    if (to_remove.empty()) {
+        return false;
+    }
+
+    // resolve blob paths from symlinks before deleting snapshot entries
+    std::vector<fs::path> blobs_to_check;
+    for (const auto & p : to_remove) {
+        std::error_code ec;
+        if (fs::is_symlink(p, ec)) {
+            auto target = fs::read_symlink(p, ec);
+            if (!ec) {
+                blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
+            }
+        }
+    }
+
+    // remove snapshot entries
+    for (const auto & p : to_remove) {
+        std::error_code ec;
+        fs::remove(p, ec);
+        if (ec) {
+            LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
+        }
+    }
+
+    if (blobs_to_check.empty()) {
+        return true;
+    }
+
+    // collect blobs still referenced by remaining snapshot entries
+    std::unordered_set<std::string> still_referenced;
+    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
+        fs::path p(f.local_path);
+        std::error_code ec;
+        if (fs::is_symlink(p, ec)) {
+            auto target = fs::read_symlink(p, ec);
+            if (!ec) {
+                still_referenced.insert((p.parent_path() / target).lexically_normal().string());
+            }
+        }
+    }
+
+    // remove orphaned blobs
+    for (const auto & blob : blobs_to_check) {
+        if (still_referenced.find(blob.string()) == still_referenced.end()) {
+            std::error_code ec;
+            fs::remove(blob, ec);
+            if (ec) {
+                LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
+            }
+        }
+    }
+
+    return true;
+}
@@ -115,3 +115,10 @@ int common_download_file_single(const std::string & url,
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
+
+// Remove a cached model from disk
+// input format: "user/model" or "user/model:tag"
+// - if tag is omitted, removes the entire repo cache directory
+// - if tag is present, removes only files matching that tag (and orphaned blobs)
+// returns true if anything was removed
+bool common_download_remove(const std::string & hf_repo_with_tag);
@@ -495,4 +495,19 @@ std::string finalize_file(const hf_file & file) {
    return file.final_path;
 }

+bool remove_cached_repo(const std::string & repo_id) {
+    if (!is_valid_repo_id(repo_id)) {
+        LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
+        return false;
+    }
+    fs::path repo_path = get_repo_path(repo_id);
+    std::error_code ec;
+    auto removed = fs::remove_all(repo_path, ec);
+    if (ec) {
+        LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
+        return false;
+    }
+    return removed > 0;
+}
+
 } // namespace hf_cache
@@ -29,4 +29,7 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);

+// Remove the entire cached directory for a repo, returns true if removed
+bool remove_cached_repo(const std::string & repo_id);
+
 } // namespace hf_cache
@@ -161,6 +161,64 @@ You could update your test result in it directly.

 Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.

+## Quick Development WOW
+
+This chapter is for quick development & try with SYCL backend on Intel GPU.
+
+You need to install following sofeware before development:
+   - Intel GPU driver
+   - oneAPI package
+   - other development tools.
+
+Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide.
+
+- Linux
+
+```
+## build from source code
+./examples/sycl/build.sh
+
+## run CONV_2D_DW unit test cases
+./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
+
+## run all unit test cases
+./build/bin/test-backend-ops -b SYCL0
+
+## run with LLM on the first GPU
+./examples/sycl/test.sh -mg 0 -m xxxx.gguf
+
+## run service with LLM on the first GPU
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+./examples/sycl/start-svr.sh -m xxxx.gguf
+
+## update the docs/ops.md for new/update OPs
+./examples/sycl/update-ops-doc.sh
+```
+
+- Windows
+
+```
+## build from source code
+examples\sycl\win-build-sycl.bat
+
+## run CONV_2D_DW unit test cases
+build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
+
+## run all unit test cases
+build\bin\test-backend-ops.exe -b SYCL0
+
+## run LLM on the first GPU
+examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
+
+## run service with LLM on the first GPU
+set ONEAPI_DEVICE_SELECTOR="level_zero:0"
+examples\sycl\win-start-svr.bat -m xxxx.gguf
+
+## update the docs/ops.md for new/update OPs
+examples\sycl\win-update-ops-doc.bat
+```
+
+
 ## Linux

 ### I. Setup Environment
@@ -701,7 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
-| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
+| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -716,7 +774,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
-| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
+| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
@@ -27,11 +27,11 @@ Legend:
 |                        COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2026 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
+./scripts/create_ops_docs.py
+
@@ -0,0 +1,8 @@
+@echo off
+
+rem MIT license
+rem Copyright (C) 2026 Intel Corporation
+rem SPDX-License-Identifier: MIT
+
+build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
+python scripts\create_ops_docs.py
@@ -249,7 +249,7 @@ option(GGML_SYCL                            "ggml: use SYCL"
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
-option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"  ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API     "ggml: use Level Zero API in SYCL backend"  ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
@@ -438,7 +438,14 @@ if (GGML_CPU_ALL_VARIANTS)
            ggml_add_cpu_backend_variant(power8_2       POWER8  VSX)
            ggml_add_cpu_backend_variant(power9         POWER9  VSX)
            ggml_add_cpu_backend_variant(power10        POWER10 VSX)
-            ggml_add_cpu_backend_variant(power11        POWER11 VSX)
+            # POWER11 backend: only if compiler supports -mcpu=power11
+            check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
+            if (GGML_CXX_SUPPORTS_POWER11)
+                message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
+                ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+            else()
+                message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
+            endif()
        else()
            message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
        endif()
@@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

-            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+            if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11)
                list(APPEND ARCH_FLAGS -mcpu=power10)
            elseif (EXTRACTED_NUMBER EQUAL 9)
                list(APPEND ARCH_FLAGS -mcpu=power9)
@@ -66,7 +66,6 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base(ggml
    const char * op_str = "undefined";
    switch (op) {
        case GGML_OP_ADD_ID: op_str = "add_id"; break;
-        case GGML_OP_CONCAT: op_str = "concat"; break;
        default: GGML_ABORT("fatal error");
    };

@@ -211,6 +210,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_meta
    return res;
 }

+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat(ggml_metal_library_t lib, ggml_type tsrc) {
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_concat_%s", ggml_type_name(tsrc));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal_library_t lib, const ggml_tensor * op) {
    char base[256];
    char name[256];
@@ -1689,7 +1703,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm(ggml_metal_
 }

 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_library_t lib, const ggml_tensor * op) {
-    assert(op->op == GGML_OP_ROPE);
+    assert(op->op == GGML_OP_ROPE || op->op == GGML_OP_ROPE_BACK);
+
+    const bool is_back = op->op == GGML_OP_ROPE_BACK;

    char base[256];
    char name[256];
@@ -1713,13 +1729,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
        snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
    }

-    snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
+    snprintf(name, 256, "%s_imrope=%d_is_back=%d", base, is_imrope ? 1 : 0, is_back ? 1 : 0);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
        ggml_metal_cv_t cv = ggml_metal_cv_init();

        ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
+        ggml_metal_cv_set_bool(cv, is_back,   FC_ROPE + 1);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

@@ -115,6 +115,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows          (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat            (ggml_metal_library_t lib, enum ggml_type tsrc);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat            (ggml_metal_library_t lib, enum ggml_type tsrc);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary             (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum               (ggml_metal_library_t lib, const struct ggml_tensor * op);
@@ -1123,13 +1123,24 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
            return true;
        case GGML_OP_CONCAT:
            {
-                // kernel_concat copies one float-sized value per element.
-                // Other scalar types need a type-generic copy kernel first.
                const enum ggml_type src0_type = op->src[0]->type;
                const enum ggml_type src1_type = op->src[1]->type;
-                return src0_type == src1_type &&
-                       src0_type == op->type &&
-                       (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
+                if (src0_type != src1_type || src0_type != op->type) {
+                    return false;
+                }
+                switch (src0_type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_I8:
+                    case GGML_TYPE_I16:
+                    case GGML_TYPE_I32:
+                    case GGML_TYPE_I64:
+                        return true;
+                    case GGML_TYPE_BF16:
+                        return has_bfloat;
+                    default:
+                        return false;
+                }
            }
        case GGML_OP_ADD:
        case GGML_OP_SUB:
@@ -1173,6 +1184,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_RMS_NORM:
            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
            return true;
        case GGML_OP_IM2COL:
            return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
@@ -375,6 +375,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
                n_fuse = ggml_metal_op_norm(ctx, idx);
            } break;
        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
            {
                n_fuse = ggml_metal_op_rope(ctx, idx);
            } break;
@@ -556,7 +557,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
        /*.dim  =*/ dim,
    };

-    auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
+    auto pipeline = ggml_metal_library_get_pipeline_concat(lib, op->type);

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -4358,6 +4358,7 @@ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_
 #endif

 constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];
+constant bool FC_rope_is_back   [[function_constant(FC_ROPE + 1)]];

 static float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
@@ -4381,6 +4382,9 @@ static void rope_yarn(
    }
    *cos_theta = cos(theta) * mscale;
    *sin_theta = sin(theta) * mscale;
+    if (FC_rope_is_back) {
+        *sin_theta *= -1.0f;
+    }
 }

 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
@@ -7513,14 +7517,15 @@ template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<
 template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;

+template<typename T>
 kernel void kernel_concat(
-    constant ggml_metal_kargs_concat & args,
-    device  const char * src0,
-    device  const char * src1,
-    device        char * dst,
-    uint3   tgpig[[threadgroup_position_in_grid]],
-    ushort3 tpitg[[thread_position_in_threadgroup]],
-    ushort3   ntg[[threads_per_threadgroup]]) {
+        constant ggml_metal_kargs_concat & args,
+        device  const char * src0,
+        device  const char * src1,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {

    const int i3 = tgpig.z;
    const int i2 = tgpig.y;
@@ -7533,21 +7538,33 @@ kernel void kernel_concat(
    int o[4] = {0, 0, 0, 0};
    o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));

-    device const float * x;
-
    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        device const T * x;
+
        if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
-            x = (device const float *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
+            x = (device const T *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
        } else {
-            x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
+            x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
        }

-        device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+        device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);

        *y = *x;
    }
 }

+typedef decltype(kernel_concat<float>) kernel_concat_t;
+
+template [[host_name("kernel_concat_f32")]]  kernel kernel_concat_t kernel_concat<float>;
+template [[host_name("kernel_concat_f16")]]  kernel kernel_concat_t kernel_concat<half>;
+#if defined(GGML_METAL_HAS_BF16)
+template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat<bfloat>;
+#endif
+template [[host_name("kernel_concat_i8")]]   kernel kernel_concat_t kernel_concat<char>;
+template [[host_name("kernel_concat_i16")]]  kernel kernel_concat_t kernel_concat<short>;
+template [[host_name("kernel_concat_i32")]]  kernel kernel_concat_t kernel_concat<int>;
+template [[host_name("kernel_concat_i64")]]  kernel kernel_concat_t kernel_concat<long>;
+
 template<int nr0, typename args_t>
 void kernel_mul_mv_q2_K_f32_impl(
        args_t args,
@@ -39,8 +39,8 @@ if (WIN32)
        set(CMAKE_CXX_COMPILER "icx")
        set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
    endif()
-    # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
-    if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
+    # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO_API is enabled)
+    if(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
        if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
            set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH})
            if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
@@ -105,8 +105,8 @@ endif()

 target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")

-message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
-if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
+message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO_API ${GGML_SYCL_SUPPORT_LEVEL_ZERO_API}")
+if (GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
    # Link against Level Zero loader for direct device memory allocation.
    # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
    # in the xe kernel driver during multi-GPU inference.
@@ -114,7 +114,7 @@ if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
    find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
    if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
        target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
-        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
        message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
        message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
    else()
@@ -12,7 +12,7 @@

 #include "common.hpp"
 #include <sycl/backend.hpp>
-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
 #include <level_zero/ze_api.h>
 #endif

@@ -84,9 +84,9 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
  return sycl_down_blk_size;
 }

-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
 static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
-    return g_ggml_sycl_enable_level_zero &&
+    return g_ggml_sycl_use_level_zero_api &&
        q.get_device().is_gpu() &&
        q.get_backend() == sycl::backend::ext_oneapi_level_zero;
 }
@@ -95,7 +95,7 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
 // Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
 // DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
 void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
    if (ggml_sycl_use_level_zero_device_alloc(q)) {
        void *ptr = nullptr;
        auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
@@ -127,7 +127,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {

 void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
    if (!ptr) return;
-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
    if (ggml_sycl_use_level_zero_device_alloc(q)) {
        auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
        zeMemFree(ze_ctx, ptr);
@@ -324,7 +324,7 @@ struct ggml_tensor_extra_gpu {
  optimize_feature optimized_feature;
 };

-extern int g_ggml_sycl_enable_level_zero;
+extern int g_ggml_sycl_use_level_zero_api;
 void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
 void ggml_sycl_free_device(void *ptr, sycl::queue &q);

@@ -0,0 +1,158 @@
+#include "conv2d-dw.hpp"
+
+struct conv2d_dw_params {
+    int in_w, in_h;
+    int out_w, out_h;
+    int kernel_w, kernel_h;
+    int stride_x, stride_y;
+    int padding_x, padding_y;
+    int dilation_x, dilation_y;
+    int channels, batches;
+};
+
+struct conv2d_dw_kernel_bounds {
+    int y_min, y_max;
+    int x_min, x_max;
+};
+
+static inline conv2d_dw_kernel_bounds dw_calculate_kernel_bounds(int out_x, int out_y,
+                                                                  const conv2d_dw_params & p) {
+    conv2d_dw_kernel_bounds bounds;
+    bounds.y_min = sycl::max(0, (p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
+    bounds.y_max = sycl::min(p.kernel_h,
+                             (p.in_h + p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
+    bounds.x_min = sycl::max(0, (p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
+    bounds.x_max = sycl::min(p.kernel_w,
+                             (p.in_w + p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
+    return bounds;
+}
+
+static inline int dw_calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
+    return out_coord * stride + kern_coord * dilation - padding;
+}
+
+// whcn layout: input/output stored as [N, C, H, W]
+struct dw_whcn_layout {
+    static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
+        return n * (p.channels * p.in_w * p.in_h) + c * p.in_w * p.in_h + y * p.in_w + x;
+    }
+    static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
+        return c * p.kernel_h * p.kernel_w + ky * p.kernel_w + kx;
+    }
+    static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
+        return n * (p.channels * p.out_w * p.out_h) + c * p.out_w * p.out_h + y * p.out_w + x;
+    }
+    static void unpack_indices(int global_idx, const conv2d_dw_params & p,
+                               int & n, int & c, int & out_y, int & out_x) {
+        out_x  = global_idx % p.out_w;
+        out_y  = (global_idx / p.out_w) % p.out_h;
+        c      = (global_idx / (p.out_w * p.out_h)) % p.channels;
+        n      = global_idx / (p.out_w * p.out_h * p.channels);
+    }
+};
+
+// cwhn layout: input/output stored as [N, H, W, C]
+struct dw_cwhn_layout {
+    static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
+        return n * (p.channels * p.in_w * p.in_h) + (y * p.in_w + x) * p.channels + c;
+    }
+    static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
+        return (ky * p.kernel_w + kx) * p.channels + c;
+    }
+    static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
+        return n * (p.channels * p.out_w * p.out_h) + y * (p.out_w * p.channels) + x * p.channels + c;
+    }
+    static void unpack_indices(int global_idx, const conv2d_dw_params & p,
+                               int & n, int & c, int & out_y, int & out_x) {
+        c      = global_idx % p.channels;
+        out_x  = (global_idx / p.channels) % p.out_w;
+        out_y  = (global_idx / (p.channels * p.out_w)) % p.out_h;
+        n      = global_idx / (p.channels * p.out_w * p.out_h);
+    }
+};
+
+template <typename Layout>
+static void conv2d_dw_kernel(const float * input, const float * kernel, float * output,
+                             const conv2d_dw_params p, const sycl::nd_item<3> & item_ct1) {
+    const int global_idx     = item_ct1.get_local_id(2) +
+                               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    const int total_elements = p.batches * p.channels * p.out_h * p.out_w;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    int n, c, out_y, out_x;
+    Layout::unpack_indices(global_idx, p, n, c, out_y, out_x);
+
+    float acc = 0.0f;
+    const conv2d_dw_kernel_bounds bounds = dw_calculate_kernel_bounds(out_x, out_y, p);
+
+    for (int ky = bounds.y_min; ky < bounds.y_max; ++ky) {
+        const int in_y = dw_calculate_input_coord(out_y, ky, p.stride_y, p.dilation_y, p.padding_y);
+        for (int kx = bounds.x_min; kx < bounds.x_max; ++kx) {
+            const int in_x = dw_calculate_input_coord(out_x, kx, p.stride_x, p.dilation_x, p.padding_x);
+            acc += input[Layout::input_index(n, c, in_y, in_x, p)] *
+                   kernel[Layout::kernel_index(c, ky, kx, p)];
+        }
+    }
+
+    output[Layout::output_index(n, c, out_y, out_x, p)] = acc;
+}
+
+template <typename Layout>
+static void conv2d_dw_sycl(const float * x_d, const float * w_d, float * y_d,
+                            const conv2d_dw_params p, const queue_ptr & stream) {
+    const int total      = p.batches * p.channels * p.out_h * p.out_w;
+    const int num_blocks = (total + SYCL_CONV2D_DW_BLOCK_SIZE - 1) / SYCL_CONV2D_DW_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_DW_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, 1, num_blocks);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            conv2d_dw_kernel<Layout>(x_d, w_d, y_d, p, item_ct1);
+        });
+}
+
+void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+
+    GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+
+    const float * w_d = (const float *) kernel->data;
+    const float * x_d = (const float *) input->data;
+    float *       y_d = (float *) dst->data;
+
+    const int32_t * p          = (const int32_t *) dst->op_params;
+    const int       stride_x   = p[0];
+    const int       stride_y   = p[1];
+    const int       padding_x  = p[2];
+    const int       padding_y  = p[3];
+    const int       dilation_x = p[4];
+    const int       dilation_y = p[5];
+
+    const int in_w     = input->ne[0];
+    const int in_h     = input->ne[1];
+    const int kernel_w = kernel->ne[0];
+    const int kernel_h = kernel->ne[1];
+    const int out_w    = dst->ne[0];
+    const int out_h    = dst->ne[1];
+    const int channels = dst->ne[2];
+    const int batches  = dst->ne[3];
+
+    const conv2d_dw_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h,
+                                      stride_x, stride_y, padding_x, padding_y,
+                                      dilation_x, dilation_y, channels, batches };
+
+    const queue_ptr stream = ctx.stream();
+
+    if (ggml_is_contiguous(input)) {
+        conv2d_dw_sycl<dw_whcn_layout>(x_d, w_d, y_d, params, stream);
+    } else if (ggml_is_contiguous_channels(input)) {
+        conv2d_dw_sycl<dw_cwhn_layout>(x_d, w_d, y_d, params, stream);
+    } else {
+        GGML_ABORT("Unsupported memory layout for conv2d_dw");
+    }
+}
@@ -0,0 +1,10 @@
+#ifndef GGML_SYCL_CONV2D_DW_HPP
+#define GGML_SYCL_CONV2D_DW_HPP
+
+#include "common.hpp"
+
+#define SYCL_CONV2D_DW_BLOCK_SIZE 256
+
+void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_CONV2D_DW_HPP
@@ -0,0 +1,125 @@
+#include "conv2d-transpose.hpp"
+#include "convert.hpp"
+
+template <typename kernel_t>
+static void conv2d_transpose_kernel(const float * input, const kernel_t * kernel, float * output,
+                                    const int in_w, const int in_h,
+                                    const int out_w, const int out_h,
+                                    const int kernel_w, const int kernel_h,
+                                    const int stride,
+                                    const int c_in, const int c_out, const int batches,
+                                    const sycl::nd_item<3> & item_ct1) {
+    const int global_idx     = item_ct1.get_local_id(2) +
+                               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    const int total_elements = out_w * out_h * c_out * batches;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    const int out_x = global_idx % out_w;
+    const int out_y = (global_idx / out_w) % out_h;
+    const int c_idx = (global_idx / (out_w * out_h)) % c_out;
+    const int n_idx = global_idx / (out_w * out_h * c_out);
+
+    float acc = 0.0f;
+
+    for (int c_in_idx = 0; c_in_idx < c_in; ++c_in_idx) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            int in_y = out_y - kh;
+            if (in_y < 0 || in_y % stride) {
+                continue;
+            }
+            in_y /= stride;
+            if (in_y >= in_h) {
+                continue;
+            }
+
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int in_x = out_x - kw;
+                if (in_x < 0 || in_x % stride) {
+                    continue;
+                }
+                in_x /= stride;
+                if (in_x >= in_w) {
+                    continue;
+                }
+
+                const int input_idx  = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + in_w * in_y + in_x;
+                const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx +
+                                       kernel_w * kh + kw;
+
+                acc += input[input_idx] * ggml_sycl_cast<float>(kernel[kernel_idx]);
+            }
+        }
+    }
+
+    output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + out_w * out_y + out_x] = acc;
+}
+
+template <typename kernel_t>
+static void conv2d_transpose_sycl(const float * input_d, const kernel_t * kernel_d, float * output_d,
+                                   const int in_w, const int in_h,
+                                   const int out_w, const int out_h,
+                                   const int kernel_w, const int kernel_h,
+                                   const int stride,
+                                   const int c_in, const int c_out, const int batches,
+                                   const queue_ptr & stream) {
+    const int total      = out_w * out_h * c_out * batches;
+    const int num_blocks = (total + SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, 1, num_blocks);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            conv2d_transpose_kernel<kernel_t>(input_d, kernel_d, output_d,
+                                             in_w, in_h, out_w, out_h, kernel_w, kernel_h,
+                                             stride, c_in, c_out, batches, item_ct1);
+        });
+}
+
+// input:  (W, H, C_in, N)
+// kernel: (W, H, C_out, C_in)
+// output: (W, H, C_out, N)
+void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+
+    GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
+    GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(input));
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const float * input_d  = (const float *) input->data;
+    float *       output_d = (float *) dst->data;
+    const void *  kernel_d = kernel->data;
+
+    const int input_w      = input->ne[0];
+    const int input_h      = input->ne[1];
+    const int channels_in  = input->ne[2];
+    const int batches      = input->ne[3];
+    const int output_w     = dst->ne[0];
+    const int output_h     = dst->ne[1];
+    const int channels_out = kernel->ne[2];
+    const int kernel_w     = kernel->ne[0];
+    const int kernel_h     = kernel->ne[1];
+    const int stride       = dst->op_params[0];
+
+    GGML_ASSERT(channels_in == kernel->ne[3]);
+    GGML_ASSERT(stride > 0);
+
+    const queue_ptr stream = ctx.stream();
+
+    if (kernel->type == GGML_TYPE_F16) {
+        conv2d_transpose_sycl<sycl::half>(input_d, (const sycl::half *) kernel_d, output_d,
+                                          input_w, input_h, output_w, output_h, kernel_w, kernel_h,
+                                          stride, channels_in, channels_out, batches, stream);
+    } else {
+        conv2d_transpose_sycl<float>(input_d, (const float *) kernel_d, output_d,
+                                     input_w, input_h, output_w, output_h, kernel_w, kernel_h,
+                                     stride, channels_in, channels_out, batches, stream);
+    }
+}
@@ -0,0 +1,10 @@
+#ifndef GGML_SYCL_CONV2D_TRANSPOSE_HPP
+#define GGML_SYCL_CONV2D_TRANSPOSE_HPP
+
+#include "common.hpp"
+
+#define SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE 256
+
+void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_CONV2D_TRANSPOSE_HPP
@@ -0,0 +1,150 @@
+#include "conv2d.hpp"
+#include "convert.hpp"
+
+struct conv2d_params {
+    const int64_t IW, IH;
+    const int64_t OW, OH;
+    const int64_t KW, KH;
+    const int64_t ST_X, ST_Y;
+    const int64_t PD_X, PD_Y;
+    const int64_t DL_X, DL_Y;
+    const int64_t IC, OC;
+    const int64_t B;
+    const int64_t TOTAL;
+};
+
+struct conv2d_kernel_bounds {
+    int64_t y_min, y_max;
+    int64_t x_min, x_max;
+};
+
+static inline int64_t conv2d_max64(int64_t a, int64_t b) {
+    return (a > b) ? a : b;
+}
+
+static inline int64_t conv2d_min64(int64_t a, int64_t b) {
+    return (a < b) ? a : b;
+}
+
+static inline conv2d_kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv2d_params & P) {
+    conv2d_kernel_bounds bounds;
+    bounds.y_min = conv2d_max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
+    bounds.y_max = conv2d_min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
+    bounds.x_min = conv2d_max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
+    bounds.x_max = conv2d_min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
+    return bounds;
+}
+
+static inline int calculate_input_coord(int64_t out_coord, int64_t kern_coord, int64_t stride,
+                                        int64_t dilation, int64_t padding) {
+    return out_coord * stride + kern_coord * dilation - padding;
+}
+
+// whcn layout helpers (matching ggml tensor memory order)
+static inline int64_t whcn_input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
+    return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x;
+}
+
+static inline int64_t whcn_kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv2d_params & P) {
+    return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx;
+}
+
+static inline int64_t whcn_output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
+    return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x;
+}
+
+template <typename T>
+static void conv2d_kernel(const float * input, const T * kernel, float * output,
+                          const conv2d_params P, const sycl::nd_item<3> & item_ct1) {
+    const int64_t global_idx = item_ct1.get_local_id(2) +
+                               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+
+    if (global_idx >= P.TOTAL) {
+        return;
+    }
+
+    const int64_t out_x  = global_idx % P.OW;
+    const int64_t out_y  = (global_idx / P.OW) % P.OH;
+    const int64_t c_out  = (global_idx / (P.OW * P.OH)) % P.OC;
+    const int64_t n      = global_idx / (P.OW * P.OH * P.OC);
+
+    float acc = 0.0f;
+
+    const conv2d_kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
+
+    for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
+        for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
+            const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
+            for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
+                const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
+                const float input_val  = input[whcn_input_index(n, c_in, in_y, in_x, P)];
+                const T     kernel_val = kernel[whcn_kernel_index(c_out, c_in, ky, kx, P)];
+                acc += input_val * ggml_sycl_cast<float>(kernel_val);
+            }
+        }
+    }
+
+    output[whcn_output_index(n, c_out, out_y, out_x, P)] = acc;
+}
+
+template <typename T>
+static void conv2d_sycl(const float * X_D, const T * K_D, float * Y_D,
+                        const conv2d_params P, const queue_ptr & stream) {
+    const int num_blocks = (P.TOTAL + SYCL_CONV2D_BLOCK_SIZE - 1) / SYCL_CONV2D_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, 1, num_blocks);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            conv2d_kernel<T>(X_D, K_D, Y_D, P, item_ct1);
+        });
+}
+
+void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+    const float *       K_D    = (const float *) kernel->data;
+    const float *       X_D    = (const float *) input->data;
+    float *             Y_D    = (float *) dst->data;
+
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
+    GGML_ASSERT(input->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    // same number of input channels
+    GGML_ASSERT(input->ne[2] == kernel->ne[2]);
+
+    const queue_ptr stream = ctx.stream();
+
+    const int32_t * p    = (const int32_t *) dst->op_params;
+    const int       ST_X = p[0];
+    const int       ST_Y = p[1];
+    const int       PD_X = p[2];
+    const int       PD_Y = p[3];
+    const int       DL_X = p[4];
+    const int       DL_Y = p[5];
+
+    // no cwhn layout support
+    GGML_ASSERT(p[6] == 0);
+
+    const int IW = input->ne[0];
+    const int IH = input->ne[1];
+    const int OW = dst->ne[0];
+    const int OH = dst->ne[1];
+    const int KW = kernel->ne[0];
+    const int KH = kernel->ne[1];
+    const int IC = input->ne[2];
+    const int OC = kernel->ne[3];
+    const int B  = input->ne[3];
+
+    const int64_t     total  = (int64_t) B * OC * OH * OW;
+    const conv2d_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };
+
+    if (kernel->type == GGML_TYPE_F16) {
+        conv2d_sycl<sycl::half>(X_D, (const sycl::half *) K_D, Y_D, params, stream);
+    } else {
+        conv2d_sycl<float>(X_D, K_D, Y_D, params, stream);
+    }
+}
@@ -0,0 +1,10 @@
+#ifndef GGML_SYCL_CONV2D_HPP
+#define GGML_SYCL_CONV2D_HPP
+
+#include "common.hpp"
+
+#define SYCL_CONV2D_BLOCK_SIZE 256
+
+void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_CONV2D_HPP
@@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct

 to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
    switch (type) {
+        case GGML_TYPE_Q1_0:
+            return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
        case GGML_TYPE_Q4_0:
            if (dst->src[0]->extra &&
                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {

 to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
    switch (type) {
+        case GGML_TYPE_Q1_0:
+            return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
        case GGML_TYPE_Q4_0:
            if (dst->src[0]->extra &&
                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
        case GGML_TYPE_BF16:
            return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
 #endif
+        case GGML_TYPE_Q1_0:
+            return dequantize_block_nc_sycl<QK1_0, QR1_0, dequantize_q1_0>;
        case GGML_TYPE_Q4_0:
            return dequantize_block_nc_sycl<QK4_0, QR4_0, dequantize_q4_0>;
        case GGML_TYPE_Q4_1:
@@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int
 #endif // GGML_SYCL_F16
 }

+static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
+                                            const int iqs, dfloat2 &v) {
+    // Q1_0 reorder layout: scale values followed by quantized bits
+    const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
+
+    const int bit_index_0 = iqs + 0;
+    const int bit_index_1 = iqs + 1;
+
+    const int bit_0 = (*((const uint8_t *)qs + bit_index_0 / 8) >> (bit_index_0 % 8)) & 1;
+    const int bit_1 = (*((const uint8_t *)qs + bit_index_1 / 8) >> (bit_index_1 % 8)) & 1;
+
+    v.x() = (2 * bit_0 - 1) * d;
+    v.y() = (2 * bit_1 - 1) * d;
+}
+
 static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
    const block_q4_1 * x = (const block_q4_1 *) vx;
@@ -1423,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
    }
 }

+static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec_reorder<QK1_0, QR1_0, dequantize_q1_0_reorder>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK1_0, QR1_0, dequantize_q1_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
 static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
                                             float *dst, const int ncols,
                                             const int nrows,
@@ -1759,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
    sycl::half *src1_dfloat = nullptr; // dfloat == half

    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q1_0 ||
        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
@@ -1777,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 #endif // GGML_SYCL_F16

    switch (src0->type) {
+        case GGML_TYPE_Q1_0:
+            if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            } else {
+                dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            }
+            break;
        case GGML_TYPE_Q4_0:
            if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -32,7 +32,7 @@

 #include <sycl/sycl.hpp>
 #include <sycl/backend.hpp>
-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
 #include <level_zero/ze_api.h>
 #endif
 #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
@@ -62,6 +62,9 @@
 #include "ggml-sycl/repeat_back.hpp"
 #include "ggml-sycl/set_rows.hpp"
 #include "ggml-sycl/set.hpp"
+#include "ggml-sycl/conv2d.hpp"
+#include "ggml-sycl/conv2d-dw.hpp"
+#include "ggml-sycl/conv2d-transpose.hpp"
 #include "ggml-sycl/ssm_conv.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
 #include "ggml-sycl/ssm_scan.hpp"
@@ -84,7 +87,7 @@ int g_ggml_sycl_enable_vmm = 1;
 int g_ggml_sycl_prioritize_dmmv = 0;
 int g_ggml_sycl_use_async_mem_op = 0;
 int g_ggml_sycl_use_async_mem_op_requested = 1;
-int g_ggml_sycl_enable_level_zero = 0;
+int g_ggml_sycl_use_level_zero_api = 0;
 int g_ggml_sycl_enable_flash_attention = 1;
 int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
 int g_ggml_sycl_usm_system = 0;
@@ -154,7 +157,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
            info.ext_oneapi_level_zero = false;
        }

-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
        if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
            ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
            ze_device_properties_t props = {};
@@ -169,13 +172,13 @@ static ggml_sycl_device_info ggml_sycl_init() {
        info.default_tensor_split[id] /= total_vram;
    }

-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
    // Large buffers can be allocated before ggml_check_sycl() initializes other
    // g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
-    g_ggml_sycl_enable_level_zero =
-        info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
+    g_ggml_sycl_use_level_zero_api =
+        info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_USE_LEVEL_ZERO_API", 1);
 #else
-    g_ggml_sycl_enable_level_zero = 0;
+    g_ggml_sycl_use_level_zero_api = 0;
 #endif

    return info;
@@ -274,7 +277,7 @@ static void ggml_check_sycl() try {
        g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);

        g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
-        if (g_ggml_sycl_enable_level_zero == 0) {
+        if (g_ggml_sycl_use_level_zero_api == 0) {
            g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
        }

@@ -309,10 +312,10 @@ static void ggml_check_sycl() try {
 #else
        GGML_LOG_INFO("  GGML_SYCL_DNNL: no\n");
 #endif
-#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
-        GGML_LOG_INFO("  GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
+        GGML_LOG_INFO("  GGML_SYCL_SUPPORT_LEVEL_ZERO_API: yes\n");
 #else
-        GGML_LOG_INFO("  GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
+        GGML_LOG_INFO("  GGML_SYCL_SUPPORT_LEVEL_ZERO_API: no\n");
 #endif
 #if defined(GGML_SYCL_USE_VMM)
        GGML_LOG_INFO("  GGML_SYCL_USE_VMM: yes\n");
@@ -328,12 +331,12 @@ static void ggml_check_sycl() try {
 #else
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
 #endif
-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
-        GGML_LOG_INFO("  GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
+        GGML_LOG_INFO("  GGML_SYCL_USE_LEVEL_ZERO_API: %d\n", g_ggml_sycl_use_level_zero_api);
        GGML_LOG_INFO("  GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
 #else
-        GGML_LOG_INFO("  GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
-        GGML_LOG_INFO("  GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
+        GGML_LOG_INFO("  GGML_SYCL_USE_LEVEL_ZERO_API: Disable Level Zero API usage by compile flag\n");
+        GGML_LOG_INFO("  GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO_API\n",
                      g_ggml_sycl_dev2dev_memcpy);
 #endif
 #if GGML_SYCL_DNNL
@@ -599,7 +602,7 @@ catch (sycl::exception const &exc) {
  std::exit(1);
 }

-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
 static bool ggml_sycl_is_l0_discrete_gpu(int device) {
    return ggml_sycl_info().devices[device].l0_discrete_gpu;
 }
@@ -608,12 +611,12 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
 static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
                    const void *ptr_src, size_t size) {

-#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
    if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
        // Use Level Zero direct copy for dGPU-to-dGPU transfers.
        const bool l0_copy_supported =
            ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
-        if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
+        if (g_ggml_sycl_use_level_zero_api && l0_copy_supported) {
            auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
            auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
            ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
@@ -973,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
    }

    switch(type) {
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
            return max_compute_capability >= VER_GEN9 ? 128 : 64;
@@ -3504,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {

 inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
    switch (type) {
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
            return true;
@@ -3519,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {

 inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
    switch (type) {
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
            return true;
@@ -3529,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {

 inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
    switch (type) {
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q3_K:
@@ -3543,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {

 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
    switch (type) {
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
@@ -4664,12 +4672,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_ARGMAX:
            ggml_sycl_argmax(ctx, dst);
            break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            ggml_sycl_op_conv_transpose_1d(ctx, dst);
+        case GGML_OP_CONV_2D:
+            ggml_sycl_op_conv2d(ctx, dst);
+            break;
+        case GGML_OP_CONV_2D_DW:
+            ggml_sycl_op_conv2d_dw(ctx, dst);
            break;
        case GGML_OP_CONV_3D:
            ggml_sycl_conv_3d(ctx, dst);
            break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_sycl_op_conv_transpose_1d(ctx, dst);
+            break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            ggml_sycl_op_conv2d_transpose(ctx, dst);
+            break;
        case GGML_OP_REPEAT:
            ggml_sycl_repeat(ctx, dst);
            break;
@@ -5373,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
    return nullptr;
 }

-static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    ggml_backend_sycl_device_context *sycl_ctx =
        (ggml_backend_sycl_device_context *)dev->context;
    int device = sycl_ctx->device;
@@ -5387,6 +5404,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                }
                return false;
            }
+        case GGML_OP_CONV_2D:
+        case GGML_OP_CONV_2D_DW:
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            return true;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_SGN:
@@ -5434,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                struct ggml_tensor * a = op->src[0];
                struct ggml_tensor * b = op->src[1];

-                // disable Q1_0 until implementation
-                if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
-                    return false;
-                }
-
                if (a->ne[3] != b->ne[3]) {
                    return false;
                }

                ggml_type src0_type = op->src[0]->type;

-
-
                // TODO: The configuration below needs more work to be supported with oneDNN
                if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
                    a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
@@ -5456,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                // TODO: This specific configuration can fail with oneDNN and needs more debugging
                if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
                    a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
+                        printf("zjy 2\n");
                    return false;
                }
                return true;
            }
        case GGML_OP_OUT_PROD:
-            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
+            return op->type == GGML_TYPE_F32 &&
+                   (op->src[0]->type == GGML_TYPE_F32 ||
+                    (op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] &&
+                     op->src[0]->ne[3] == op->src[1]->ne[3])) &&
+                   op->src[1]->type == GGML_TYPE_F32;
        case GGML_OP_GET_ROWS:
            {
                switch (op->src[0]->type) {
@@ -5718,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
    GGML_UNUSED(dev);
 }

+static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    bool res = do_ggml_backend_sycl_device_supports_op(dev, op);
+    GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op),
+                    ggml_type_name(op->type), res ? "true" : "false");
+    return res;
+}
+
 static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
        return false;
@@ -1194,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols(
    }
 }

+static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy,
+                                       float * dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK1_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                mul_mat_vec_q<QK1_0, QI1_0, block_q1_0,
+                              VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
+                    vx, vy, dst, ncols, nrows, item_ct1);
+            });
+    });
+}
+
+template <int ncols_dst>
+static void mul_mat_vec_q1_0_q8_1_sycl_ncols(
+        const void * vx, const void * vy, float * dst,
+        const int ncols, const int nrows,
+        const int stride_col_y, const int stride_col_dst,
+        dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK1_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                mul_mat_vec_q_ncols<QK1_0, QI1_0, block_q1_0,
+                                    VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1, ncols_dst>(
+                    vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
+        const void * vx, const void * vy, float * dst,
+        const int ncols, const int nrows, const int ncols_dst,
+        const int stride_col_y, const int stride_col_dst,
+        dpct::queue_ptr stream) {
+    switch (ncols_dst) {
+        case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break;
+        case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
+        default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst);
+    }
+}
+
 static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
@@ -2120,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                    mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
+            case GGML_TYPE_Q1_0:
+                if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
+                    const int stride_col_y   = src1_padded_col_size / QK8_1;
+                    const int stride_col_dst = dst->ne[0];
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols);
+                    mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
+                        src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff,
+                        src1_ncols, stride_col_y, stride_col_dst, stream);
+                    return;
+                } else if (i == 0 || src1_ncols == 1) {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n");
+                    mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
+                break;
            case GGML_TYPE_Q2_K:
                if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
                    const int stride_col_y   = src1_padded_col_size / QK8_1;
@@ -1,11 +1,12 @@
 #include "outprod.hpp"
+#include "convert.hpp"

 void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    const ggml_tensor *src0 = dst->src[0];
    const ggml_tensor *src1 = dst->src[1];

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
@@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
    GGML_ASSERT(ne01 == ne11);  // Inner dimensions must match
    GGML_ASSERT(ne0 == ne00);   // Output rows match src0 rows
    GGML_ASSERT(ne1 == ne10);   // Output cols match src1 cols
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(ne2 % ne02 == 0);
+    GGML_ASSERT(ne3 % ne03 == 0);

    // Get data pointers
-    const float* src0_d = (const float*)src0->data;
-    const float* src1_d = (const float*)src1->data;
-    float* dst_d = (float*)dst->data;
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float * dst_d = (float *) dst->data;
+
+    ggml_sycl_pool_alloc<float> src0_as_f32(ctx.pool());
+    int64_t src0_nb02 = nb02;
+    int64_t src0_nb03 = nb03;
+    if (src0->type == GGML_TYPE_Q1_0) {
+        scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
+                                             " : converting src0 Q1_0 to fp32");
+        src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03);
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
+        GGML_ASSERT(to_fp32_sycl != nullptr);
+        to_fp32_sycl(src0->data, const_cast<float *>(src0_d), ne00 * ne01 * ne02 * ne03, stream);
+
+        // Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03].
+        src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float);
+        src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float);
+    }

    // GEMM parameters
    const float alpha = 1.0f;
@@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
    const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
    const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);

+    const int64_t r2 = ne2 / ne02;
+    const int64_t r3 = ne3 / ne03;
+
    try {
-        // Perform matrix multiplication using oneMKL GEMM
-        oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
-                                               ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
-    }
-    catch (sycl::exception const& exc) {
+        // OUT_PROD applies independently to each (i2, i3) destination plane.
+        for (int64_t i3 = 0; i3 < ne3; ++i3) {
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                const int64_t i03 = i3 / r3;
+                const int64_t i02 = i2 / r2;
+
+                const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03);
+                const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13);
+                float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3);
+
+                // Perform matrix multiplication using oneMKL GEMM
+                oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
+                                                      ne0, ne1, ne01, alpha, src0_plane, ne00,
+                                                      src1_plane, ldb, beta, dst_plane, ne0);
+            }
+        }
+    } catch (sycl::exception const& exc) {
        std::cerr << exc.what() << std::endl;
        GGML_ASSERT(false);
    }
@@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
        vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
 }

+#define VDR_Q1_0_Q8_1_MMVQ 1
+#define VDR_Q1_0_Q8_1_MMQ  4
+
+static __dpct_inline__ float
+vec_dot_q1_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq;
+
+    const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
+    const float        d1          = bq1_0->d;
+    const int          v           = get_int_from_uint8_aligned(bq1_0->qs, iqs);
+
+    int vi_bytes[8];
+#pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        const int shift = j * 4;
+        const int bits4 = (v >> shift) & 0x0F;
+        const int b0    = (bits4 & 0x01) ? 1 : -1;
+        const int b1    = (bits4 & 0x02) ? 1 : -1;
+        const int b2    = (bits4 & 0x04) ? 1 : -1;
+        const int b3    = (bits4 & 0x08) ? 1 : -1;
+        vi_bytes[j]     = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
+    }
+
+    int sumi = 0;
+#pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j);
+        sumi        = ggml_sycl_dp4a(vi_bytes[j], u, sumi);
+    }
+
+    return d1 * bq8_1_chunk->ds[0] * sumi;
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

@@ -1382,7 +1382,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
    const auto & hparams = model.hparams;

    // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
-    const int64_t n_embd = hparams.n_embd_inp();
+    const int64_t n_embd = hparams.n_embd_inp_enc();
    const int64_t n_vocab = model.vocab.n_tokens();

    // note: during encode, we always pass the full sequence starting from pos = 0
@@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const {
    return n_embd_inp;
 }

+uint32_t llama_hparams::n_embd_inp_enc() const {
+    return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp();
+}
+
 uint32_t llama_hparams::n_embd_out() const {
    return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
 }
@@ -189,6 +189,10 @@ struct llama_hparams {
    // input embedding dimension (0 = use n_embd)
    uint32_t n_embd_inp_impl = 0;

+    // encoder input embedding dimension (0 = use n_embd_inp())
+    // e.g. the eagle3 encoder fuses target_layers * target_hidden features
+    uint32_t n_embd_inp_enc_impl = 0;
+
    // output embedding dimension (0 = use n_embd)
    uint32_t n_embd_out_impl = 0;

@@ -305,6 +309,9 @@ struct llama_hparams {
    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

+    // dimension of the encoder input embeddings
+    uint32_t n_embd_inp_enc() const;
+
    // dimension of output embeddings
    uint32_t n_embd_out() const;

@@ -249,7 +249,7 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama
    }

    // if using single GPU mode, remove all except the main GPU
-    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+    if (params.split_mode == LLAMA_SPLIT_MODE_NONE && !model->devices.empty()) {
        if (params.main_gpu < 0) {
            model->devices.clear();
        } else {
@@ -19,7 +19,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
    LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);

-    hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
+    hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;

    // eagle3 norm_before_residual (optional, default false)
    // compatible with Readhat eagle3 speculator model
@@ -34,7 +34,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
    LLAMA_LOAD_LOCALS;

-    const int64_t n_embd_inp = hparams.n_embd_inp();
+    const int64_t n_embd_inp = hparams.n_embd_inp_enc();
    const int64_t n_embd_attn_input = 2 * n_embd;

    // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
@@ -109,8 +109,8 @@ ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {

    // Input: Target model features (3 layers concatenated: low, mid, high)
    // Data will be provided via ubatch->embd in encode_eagle3_features()
-    auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
-    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
+    auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp_enc());
+    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens);
    ggml_set_input(inp_target->embd);

    cur = inp_target->embd;
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
        img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
        output.entries.push_back(std::move(res));
    }
+    output.grid_x = inst.grid_size.width;
+    output.grid_y = inst.grid_size.height;
    return true;
 }

@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
    output.entries.push_back(std::move(img_f32));
    return true;
 }
+
+bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // call super class preprocessor
+    bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
+    if (!ok) {
+        return false;
+    }
+    if (output.entries.size() == 1) {
+        // Single-tile (overview only): append one newline row.
+        output.entries[0]->add_newline = true;
+    } else {
+        // Multi-tile: overview gets no newline, grid tiles get one.
+        output.entries[0]->add_newline = false;
+        for (size_t i = 1; i < output.entries.size(); ++i) {
+            output.entries[i]->add_newline = true;
+        }
+    }
+    return true;
+}
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
    mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
+
+// similar to llava_uhd, but has add_newline
+struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
+    mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
@@ -639,7 +639,7 @@ struct mtmd_context {
                {
                    img_beg = "<image>";
                    img_end = "";
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
                } break;
            default:
                throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
    int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
        GGML_ASSERT(!bitmaps.empty());

-        if (!bitmaps[0]->is_audio) {
+        // note: only one type of media is supported per call, caller should enforce this
+        const bool is_vision = !bitmaps[0]->is_audio;
+
+        if (is_vision) {
            // handle image

            if (!ctx->ctx_v) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
                batch_f32.grid_y = tmp_batch.grid_y;
            }

-            // Annotate llava-next style tiles so clip_n_output_tokens accounts
-            // for per-tile newline injection.
-            if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
-                if (batch_f32.entries.size() == 1) {
-                    // Single-tile (overview only): append one newline row.
-                    batch_f32.entries[0]->add_newline = true;
-                } else {
-                    // Multi-tile: overview gets no newline, grid tiles get one.
-                    batch_f32.entries[0]->add_newline = false;
-                    for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
-                        batch_f32.entries[i]->add_newline = true;
-                    }
-                }
-            }
-
            // handle llava-uhd style preprocessing
            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
-            if (
-                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
-                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
-            ) {
+            if (has_tiling_grid) {
                // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
                GGML_ASSERT(bitmaps.size() == 1);

@@ -180,6 +180,24 @@ That requires `JSON.stringify` when formatted to message content:
 }
 ```

+### Model management API (router mode)
+
+Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
+
+The main goal of this API is to allow downloading models and/or removing models from the web UI. It relies on the model cache infrastructure under the hood to manage the list of models dynamically.
+
+Instead of building everything from the ground up (like what most AI agents will do when you ask them to implement a similar feature), we built on top of existing, already well-engineered components inside the codebase:
+- Model cache infrastructure as mentioned above (`common/download.h`)
+- Server response queue (`server-queue.h`). We use this feature to broadcast events to SSE clients.
+- Server router thread management (`server-models.h`). We re-use the same thread model that is used for managing subprocess life cycle, except that we don't create a new subprocess, but launch the download right inside the thread.
+
+The flow for downloading a new model:
+- POST request comes in --> `post_router_models` --> validation
+- `server_models::download()` is called
+    - Sets up a new thread `inst.th` and runs the download inside
+- If a stop request comes in, set `stop_download` to `true`
+- Otherwise, upon completion, we call `load_models()` to refresh the list of models
+
 ### Notable Related PRs

 - Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443
@@ -1778,6 +1778,20 @@ The `status` object can be:
 }
 ```

+Note: for "downloading" state, there can be multiple files be downloading in parallel
+
+```json
+"status": {
+  "value": "downloading",
+  "progress": {
+    "https://...model.gguf": {
+      "done": 195963406,
+      "total": 219307424
+    }
+  }
+}
+```
+
 ### POST `/models/load`: Load a model

 Load a model
@@ -1820,6 +1834,107 @@ Response:
 }
 ```

+### GET `/models/sse`: Real-time events
+
+Example events:
+
+```js
+{
+  "model": "...",
+  "event": "model_status",
+  "data": {
+    "status": "loading"
+  }
+}
+
+{
+  "model": "...",
+  "event": "download_progress",
+  "data": {
+    // note: there can be multiple files being downloaded in parallel
+    "https://...model.gguf": {
+      "done": 195963406,
+      "total": 219307424
+    }
+  }
+}
+
+{
+  "model": "...",
+  "event": "download_finished",
+  "data": {
+    "status": "loading"
+  }
+}
+
+{
+  "model": "...",
+  "event": "model_remove"
+}
+
+// special event: reload of the list of all models
+{
+  "model": "*",
+  "event": "models_reload"
+}
+```
+
+### POST `/models`: Download new model
+
+Trigger a new download (non-blocking), the progress can be tracked via SSE endpoint `/models/sse`
+
+To cancel model downloading, send an event to `/models/unload`
+
+Download procedure:
+- Send POST request to `/models`
+- Subscribe to `/models/sse` for updates
+- On downloading completed, you will receive either `download_finished` or `download_failed` event
+- Call GET `/models` to trigger model list update. If the download success, you should see the new model in the list
+
+Payload:
+
+```json
+{
+  "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
+}
+```
+
+Response (download is started in the background):
+
+```json
+{
+  "success": true
+}
+```
+
+Response (error, cannot start the download):
+
+```json
+{
+  "error": {
+    "code": 400,
+    "message": "model validation failed, unable to download",
+    "type": "invalid_request_error"
+  }
+}
+```
+
+### DELETE `/models`: Delete a model from cache
+
+IMPORTANT: only model stored in cache can be deleted. You cannot delete models in a preset.
+
+Model name must be passed via query param: `?model={name}`
+
+If delete success, it will send an SSE event of type `model_remove`
+
+Response:
+
+```json
+{
+  "success": true
+}
+```
+
 ## API errors

 `llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
@@ -588,6 +588,23 @@ void server_http_context::post(const std::string & path, const server_http_conte
    });
 }

+void server_http_context::del(const std::string & path, const server_http_context::handler_t & handler) const {
+    handlers.emplace(path, handler);
+    pimpl->srv->Delete(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
+        server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
+            get_params(req),
+            get_headers(req),
+            req.path,
+            build_query_string(req),
+            req.body,
+            {},
+            req.is_connection_closed
+        });
+        server_http_res_ptr response = handler(*request);
+        process_handler_response(std::move(request), response, res);
+    });
+}
+
 //
 // Vertex AI Prediction protocol (AIP_PREDICT_ROUTE)
 // https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements
@@ -86,6 +86,7 @@ struct server_http_context {

    void get(const std::string & path, const handler_t & handler) const;
    void post(const std::string & path, const handler_t & handler) const;
+    void del(const std::string & path, const handler_t & handler) const;

    // Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
    // Must be called AFTER all other API routes are registered
@@ -9,6 +9,7 @@
 #include <sheredom/subprocess.h>

 #include <functional>
+#include <optional>
 #include <algorithm>
 #include <thread>
 #include <mutex>
@@ -51,6 +52,21 @@ extern char **environ;
 // ref: https://github.com/ggml-org/llama.cpp/issues/17862
 #define CHILD_ADDR "127.0.0.1"

+struct server_subproc {
+    std::optional<subprocess_s> sproc; // empty while in DOWNLOADING state
+    std::atomic<bool> stop_download{false}; // flag to signal download cancellation
+
+    subprocess_s & get() {
+        GGML_ASSERT(sproc.has_value() && "subprocess not initialized");
+        return sproc.value();
+    }
+
+    bool is_alive() {
+        return sproc.has_value() && subprocess_alive(&sproc.value());
+    }
+};
+
+
 static std::filesystem::path get_server_exec_path() {
 #if defined(_WIN32)
    wchar_t buf[32768] = { 0 };  // Large buffer to handle long paths
@@ -272,12 +288,25 @@ void server_models::add_model(server_model_meta && meta) {
    meta.update_caps();
    std::string name = meta.name;
    mapping[name] = instance_t{
-        /* subproc */ std::make_shared<subprocess_s>(),
+        /* subproc */ std::make_shared<server_subproc>(),
        /* th      */ std::thread(),
        /* meta    */ std::move(meta)
    };
 }

+void server_models::notify_sse(const std::string & event, const std::string & model_id, const json & data) {
+    std::unique_ptr<server_task_result_router> result = std::make_unique<server_task_result_router>();
+    result->data = {
+        {"model", model_id},
+        {"event", event},
+    };
+    if (!data.is_null()) {
+        result->data["data"] = data;
+    }
+    SRV_DBG("notifying SSE clients about event '%s' for model '%s': %s\n", event.c_str(), model_id.c_str(), safe_json_to_str(result->data).c_str());
+    sse.broadcast(std::move(result));
+}
+
 void server_models::load_models() {
    // Phase 1: load presets from all sources — pure I/O, no lock needed
    // 1. cached models
@@ -304,19 +333,27 @@ void server_models::load_models() {

    // note: if a model exists in both cached and local, local takes precedence
    common_presets final_presets;
-    for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
-    for (const auto & [name, preset] : local_models)  final_presets[name] = preset;
+    std::unordered_map<std::string, server_model_source> source_map;
+    for (const auto & [name, preset] : cached_models) {
+        final_presets[name] = preset;
+        source_map[name] = SERVER_MODEL_SOURCE_CACHE;
+    }
+    for (const auto & [name, preset] : local_models)  {
+        final_presets[name] = preset;
+        source_map[name] = SERVER_MODEL_SOURCE_MODELS_DIR;
+    }
    for (const auto & [name, custom] : custom_presets) {
        if (final_presets.find(name) != final_presets.end()) {
            final_presets[name].merge(custom);
        } else {
            final_presets[name] = custom;
        }
+        source_map[name] = SERVER_MODEL_SOURCE_PRESET;
    }
-    // server base preset from CLI args takes highest precedence
-    for (auto & [name, preset] : final_presets) {
-        preset.merge(base_preset);
-    }
+
+    auto get_source = [&](const std::string & name) {
+        return source_map.count(name) ? source_map.at(name) : SERVER_MODEL_SOURCE_PRESET;
+    };

    // Helpers that read `mapping` — must be called while holding the lock.
    std::unordered_set<std::string> custom_names;
@@ -366,12 +403,15 @@ void server_models::load_models() {
    // (unload, load) or when joining threads (the monitoring thread calls update_status
    // which locks the mutex, so joining while holding it would deadlock).
    std::unique_lock<std::mutex> lk(mutex);
+
+    need_reload = false;
    bool is_first_load = mapping.empty();

    if (is_first_load) {
        // FIRST LOAD: add all models, then unlock for autoloading
        for (const auto & [name, preset] : final_presets) {
            server_model_meta meta{
+                /* source        */ get_source(name),
                /* preset        */ preset,
                /* name          */ name,
                /* aliases       */ {},
@@ -384,7 +424,7 @@ void server_models::load_models() {
                /* exit_code     */ 0,
                /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
                /* multimodal    */ mtmd_caps{false, false},
-                /* need_download */ false,
+                // /* need_download */ false,
            };
            add_model(std::move(meta));
        }
@@ -453,6 +493,9 @@ void server_models::load_models() {
            }
        }
        for (auto & [name, inst] : mapping) {
+            if (inst.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
+                continue; // downloading models are not from config sources, leave them alone
+            }
            if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) {
                threads_to_join.push_back(std::move(inst.th));
            }
@@ -465,7 +508,15 @@ void server_models::load_models() {

        // erase models no longer in any source
        for (auto it = mapping.begin(); it != mapping.end(); ) {
-            if (final_presets.find(it->first) == final_presets.end()) {
+            if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
+                ++it; // download thread is still busy, skip
+            } else if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADED) {
+                // download finished, safe to erase
+                if (it->second.th.joinable()) {
+                    it->second.th.join();
+                }
+                it = mapping.erase(it);
+            } else if (final_presets.find(it->first) == final_presets.end()) {
                SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str());
                GGML_ASSERT(!it->second.th.joinable()); // must have been joined above
                it = mapping.erase(it);
@@ -526,6 +577,7 @@ void server_models::load_models() {
        for (const auto & [name, preset] : final_presets) {
            if (mapping.find(name) == mapping.end()) {
                server_model_meta meta{
+                    /* source        */ get_source(name),
                    /* preset        */ preset,
                    /* name          */ name,
                    /* aliases       */ {},
@@ -538,7 +590,7 @@ void server_models::load_models() {
                    /* exit_code     */ 0,
                    /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
                    /* multimodal    */ mtmd_caps{false, false},
-                    /* need_download */ false,
+                    // /* need_download */ false,
                };
                add_model(std::move(meta));
                newly_added.push_back(name);
@@ -571,6 +623,8 @@ void server_models::load_models() {
            SRV_INF("(reload) loading new model %s\n", name.c_str());
            load(name);
        }
+
+        notify_sse("models_reload", "*");
    }
 }

@@ -597,7 +651,13 @@ bool server_models::has_model(const std::string & name) {
 }

 std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
-    std::lock_guard<std::mutex> lk(mutex);
+    std::unique_lock<std::mutex> lk(mutex);
+    if (need_reload) {
+        lk.unlock();
+        load_models();
+        lk.lock();
+    }
+
    auto it = mapping.find(name);
    if (it != mapping.end()) {
        return it->second.meta;
@@ -683,7 +743,13 @@ static std::vector<char *> to_char_ptr_array(const std::vector<std::string> & ve
 }

 std::vector<server_model_meta> server_models::get_all_meta() {
-    std::lock_guard<std::mutex> lk(mutex);
+    std::unique_lock<std::mutex> lk(mutex);
+    if (need_reload) {
+        lk.unlock();
+        load_models();
+        lk.lock();
+    }
+
    std::vector<server_model_meta> result;
    result.reserve(mapping.size());
    for (const auto & [name, inst] : mapping) {
@@ -770,7 +836,7 @@ void server_models::load(const std::string & name) {
        throw std::runtime_error("failed to get a port number");
    }

-    inst.subproc = std::make_shared<subprocess_s>();
+    inst.subproc = std::make_shared<server_subproc>();
    {
        SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);

@@ -792,19 +858,20 @@ void server_models::load(const std::string & name) {
        // TODO @ngxson : maybe separate stdout and stderr in the future
        //                so that we can use stdout for commands and stderr for logging
        int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
-        int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get());
+        inst.subproc->sproc.emplace();
+        int result = subprocess_create_ex(argv.data(), options, envp.data(), &inst.subproc->get());
        if (result != 0) {
            throw std::runtime_error("failed to spawn server instance");
        }

-        inst.stdin_file = subprocess_stdin(inst.subproc.get());
+        inst.stdin_file = subprocess_stdin(&inst.subproc->get());
    }

    // start a thread to manage the child process
    // captured variables are guaranteed to be destroyed only after the thread is joined
    inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port, stop_timeout = inst.meta.stop_timeout]() {
-        FILE * stdin_file = subprocess_stdin(child_proc.get());
-        FILE * stdout_file = subprocess_stdout(child_proc.get()); // combined stdout/stderr
+        FILE * stdin_file = subprocess_stdin(&child_proc->get());
+        FILE * stdout_file = subprocess_stdout(&child_proc->get()); // combined stdout/stderr

        std::thread log_thread([&]() {
            // read stdout/stderr and forward to main server log
@@ -834,14 +901,14 @@ void server_models::load(const std::string & name) {
                return this->stopping_models.find(name) != this->stopping_models.end();
            };
            auto should_wake = [&]() {
-                return is_stopping() || !subprocess_alive(child_proc.get());
+                return is_stopping() || !child_proc->is_alive();
            };
            {
                std::unique_lock<std::mutex> lk(this->mutex);
                this->cv_stop.wait(lk, should_wake);
            }
            // child may have already exited (e.g. crashed) — skip shutdown sequence
-            if (!subprocess_alive(child_proc.get())) {
+            if (!child_proc->is_alive()) {
                return;
            }
            SRV_INF("stopping model instance name=%s\n", name.c_str());
@@ -859,7 +926,7 @@ void server_models::load(const std::string & name) {
                if (elapsed >= stop_timeout * 1000) {
                    // timeout, force kill
                    SRV_WRN("force-killing model instance name=%s after %d seconds timeout\n", name.c_str(), stop_timeout);
-                    subprocess_terminate(child_proc.get());
+                    subprocess_terminate(&child_proc->get());
                    return;
                }
                this->cv_stop.wait_for(lk, std::chrono::seconds(1));
@@ -884,8 +951,8 @@ void server_models::load(const std::string & name) {

        // get the exit code
        int exit_code = 0;
-        subprocess_join(child_proc.get(), &exit_code);
-        subprocess_destroy(child_proc.get());
+        subprocess_join(&child_proc->get(), &exit_code);
+        subprocess_destroy(&child_proc->get());

        // update status and exit code
        this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
@@ -896,30 +963,118 @@ void server_models::load(const std::string & name) {
    {
        auto & old_instance = mapping[name];
        // old process should have exited already, but just in case, we clean it up here
-        if (subprocess_alive(old_instance.subproc.get())) {
+        if (old_instance.subproc->is_alive()) {
            SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str());
-            subprocess_terminate(old_instance.subproc.get()); // force kill
+            subprocess_terminate(&old_instance.subproc->get()); // force kill
        }
        if (old_instance.th.joinable()) {
            old_instance.th.join();
        }
    }

+    notify_sse("model_status", name, {
+        {"status", server_model_status_to_string(inst.meta.status)},
+    });
+
    mapping[name] = std::move(inst);
    cv.notify_all();
 }

+// callback for model downloading functionality
+struct server_models_download_res : public common_download_callback {
+    common_params_model model;
+    common_download_opts opts;
+
+    std::function<bool()> should_stop;
+    std::function<void(const common_download_progress & p)> on_progress;
+
+    bool is_ok = false;
+
+    bool run() {
+        try {
+            common_download_model(model, opts);
+            is_ok = true;
+        } catch (const std::exception & e) {
+            SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
+            is_ok = false;
+        }
+        return is_ok;
+    }
+    void on_start(const common_download_progress & p) override {
+        on_progress(p);
+    }
+    void on_update(const common_download_progress & p) override {
+        on_progress(p);
+    }
+    void on_done(const common_download_progress &, bool ok) override {
+        is_ok = ok;
+    }
+    bool is_cancelled() const override {
+        return should_stop();
+    }
+};
+
+void server_models::download(common_params_model && model, common_download_opts && opts) {
+    std::string name = model.name;
+    GGML_ASSERT(name == model.hf_repo);
+
+    std::unique_lock<std::mutex> lk(mutex);
+    if (mapping.find(name) != mapping.end()) {
+        throw std::runtime_error("model name=" + name + " already exists");
+    }
+
+    instance_t inst;
+    inst.meta.name   = name;
+    inst.meta.status = SERVER_MODEL_STATUS_DOWNLOADING;
+    inst.subproc     = std::make_shared<server_subproc>();
+
+    auto dl = std::make_unique<server_models_download_res>();
+    dl->model = model; // copy
+    dl->opts  = opts;  // copy
+
+    dl->should_stop = [sp = inst.subproc]() {
+        return sp->stop_download.load(std::memory_order_relaxed);
+    };
+
+    dl->on_progress = [this, name](const common_download_progress & p) {
+        update_download_progress(name, p, false);
+    };
+
+    inst.th = std::thread([this, dl = std::move(dl)]() {
+        dl->opts.callback = dl.get();
+        bool ok = dl->run();
+        SRV_INF("download finished for model name=%s with status=%s\n",
+                    dl->model.name.c_str(), ok ? "success" : "failure");
+        update_download_progress(dl->model.name, {}, true, ok);
+        // need_reload is set inside update_download_progress under the mutex;
+        // the next load_models() call will clean up this instance
+    });
+
+    mapping[name] = std::move(inst);
+    notify_sse("status_update", name, {
+        {"status", server_model_status_to_string(SERVER_MODEL_STATUS_DOWNLOADING)},
+    });
+    cv.notify_all();
+}
+
 void server_models::unload(const std::string & name) {
-    std::lock_guard<std::mutex> lk(mutex);
+    std::unique_lock<std::mutex> lk(mutex);
    auto it = mapping.find(name);
    if (it != mapping.end()) {
-        if (it->second.meta.is_running()) {
+        if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
+            SRV_INF("cancelling download for model name=%s\n", name.c_str());
+            it->second.subproc->stop_download.store(true, std::memory_order_relaxed);
+            // for convenience, we wait the status change here
+            wait(lk, name, [](const server_model_meta & new_meta) {
+                return new_meta.status != SERVER_MODEL_STATUS_DOWNLOADING;
+            });
+        } else if (it->second.meta.is_running()) {
            SRV_INF("stopping model instance name=%s\n", name.c_str());
            stopping_models.insert(name);
            if (it->second.meta.status == SERVER_MODEL_STATUS_LOADING) {
                // special case: if model is in loading state, unloading means force-killing it
                SRV_WRN("model name=%s is still loading, force-killing\n", name.c_str());
-                subprocess_terminate(it->second.subproc.get());
+                subprocess_terminate(&it->second.subproc->get());
            }
            cv_stop.notify_all();
            // status change will be handled by the managing thread
@@ -934,7 +1089,10 @@ void server_models::unload_all() {
    {
        std::lock_guard<std::mutex> lk(mutex);
        for (auto & [name, inst] : mapping) {
-            if (inst.meta.is_running()) {
+            if (inst.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
+                SRV_INF("cancelling download for model name=%s\n", name.c_str());
+                inst.subproc->stop_download.store(true, std::memory_order_relaxed);
+            } else if (inst.meta.is_running()) {
                SRV_INF("stopping model instance name=%s\n", name.c_str());
                stopping_models.insert(name);
                cv_stop.notify_all();
@@ -959,6 +1117,17 @@ void server_models::update_status(const std::string & name, server_model_status
        meta.status    = status;
        meta.exit_code = exit_code;
    }
+    // broadcast status change to SSE
+    {
+        json data = {
+            {"status", server_model_status_to_string(status)},
+        };
+        if (status == SERVER_MODEL_STATUS_UNLOADED) {
+            data["exit_code"] = exit_code;
+        }
+        // note: notify_sse doesn't acquire the lock, so no deadlock here
+        notify_sse("status_change", name, data);
+    }
    cv.notify_all();
 }

@@ -985,12 +1154,82 @@ void server_models::update_loaded_info(const std::string & name, std::string & r
    cv.notify_all();
 }

-void server_models::wait_until_loading_finished(const std::string & name) {
-    std::unique_lock<std::mutex> lk(mutex);
-    cv.wait(lk, [this, &name]() {
+void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
+    json curr;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
        auto it = mapping.find(name);
        if (it != mapping.end()) {
-            return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
+            if (done) {
+                // mark the instance to be erased on next load_models() call
+                it->second.meta.status = SERVER_MODEL_STATUS_DOWNLOADED;
+                need_reload = true;
+            } else {
+                json & info = it->second.meta.loaded_info;
+                if (!info.contains("progress")) {
+                    info["progress"] = json{};
+                }
+                info["progress"][progress.url] = {
+                    {"done",  progress.downloaded},
+                    {"total", progress.total},
+                };
+                curr = it->second.meta.loaded_info; // copy
+            }
+        }
+    }
+    if (done) {
+        cv.notify_all(); // notify in case unload() is waiting for download to be cancelled
+        notify_sse(ok ? "download_finished" : "download_failed", name, {});
+    } else {
+        notify_sse("download_progress", name, curr);
+    }
+}
+
+bool server_models::remove(const std::string & name) {
+    auto meta = get_meta(name);
+
+    if (!meta.has_value()) {
+        throw std::runtime_error("model name=" + name + " is not found");
+    }
+    if (meta->source != SERVER_MODEL_SOURCE_CACHE) {
+        throw std::runtime_error("model name=" + name + " is not removable (not from cache)");
+    }
+
+    unload(name); // cancel download or stop running instance
+    {
+        std::unique_lock<std::mutex> lk(mutex);
+        // a cancelled download lands on DOWNLOADED; a stopped instance lands on UNLOADED
+        wait(lk, name, [](const server_model_meta & new_meta) {
+            return new_meta.status == SERVER_MODEL_STATUS_UNLOADED
+                || new_meta.status == SERVER_MODEL_STATUS_DOWNLOADED;
+        });
+        // join before erasing - after status reaches UNLOADED/DOWNLOADED the thread no
+        // longer acquires this mutex, so joining while holding it is safe
+        if (mapping[name].th.joinable()) {
+            mapping[name].th.join();
+        }
+        // remove the model from disk (hold lock to prevent concurrent load)
+        bool ok = common_download_remove(name);
+        if (ok) {
+            mapping.erase(name);
+        }
+        SRV_INF("removing model name=%s from cache (%s)\n", name.c_str(), ok ? "succeeded" : "failed");
+        notify_sse("model_remove", name, {});
+        return ok;
+    }
+}
+
+void server_models::wait(const std::string & name, std::function<bool(const server_model_meta &)> predicate) {
+    std::unique_lock<std::mutex> lk(mutex);
+    wait(lk, name, predicate);
+}
+
+void server_models::wait(std::unique_lock<std::mutex> & lk, const std::string & name, std::function<bool(const server_model_meta &)> predicate) {
+    cv.wait(lk, [this, &name, &predicate]() {
+        auto it = mapping.find(name);
+        if (it != mapping.end()) {
+            return predicate(it->second.meta);
+
        }
        return false;
    });
@@ -1014,10 +1253,15 @@ bool server_models::ensure_model_ready(const std::string & name) {

    // wait for loading to complete
    SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
-    wait_until_loading_finished(name);
+    wait(name, [&meta](const server_model_meta & new_meta) {
+        if (new_meta.status != SERVER_MODEL_STATUS_LOADING) {
+            meta = new_meta; // update meta for final check after wait
+            return true;
+        }
+        return false;
+    });

    // check final status
-    meta = get_meta(name);
    if (!meta.has_value() || meta->is_failed()) {
        throw std::runtime_error("model name=" + name + " failed to load");
    }
@@ -1111,6 +1355,42 @@ void server_models::notify_router_sleeping_state(bool is_sleeping) {
 // server_models_routes
 //

+// RAII wrapper similar to server_response_reader, but doesn't use server_queue
+static std::atomic<int> sse_client_id_counter = 0;
+struct server_models_sse_client {
+    server_response & queue_results;
+    int client_id;
+    server_models_sse_client(server_response & q)
+            : queue_results(q), client_id(sse_client_id_counter.fetch_add(1, std::memory_order_relaxed)) {
+        SRV_DBG("new SSE client connected, assigned client_id=%d\n", client_id);
+        queue_results.add_waiting_task_id(client_id);
+    }
+    ~server_models_sse_client() {
+        SRV_DBG("SSE client disconnected, removing client_id=%d\n", client_id);
+        queue_results.remove_waiting_task_id(client_id);
+    }
+
+    // return nullptr if should_stop() is true before receiving a result
+    // note: if one error is received, it will stop further processing and return error result
+    server_task_result_ptr next(const std::function<bool()> & should_stop) {
+        while (true) {
+            static const int http_polling_seconds = 1; // check should_stop every 1 second
+            server_task_result_ptr result = queue_results.recv_with_timeout({client_id}, http_polling_seconds);
+            if (result == nullptr) {
+                // timeout, check stop condition
+                if (should_stop()) {
+                    return nullptr;
+                }
+                // continue waiting otherwise
+            } else {
+                SRV_DBG("recv result for client_id=%d: %s\n", client_id, safe_json_to_str(result->to_json()).c_str());
+                return result;
+            }
+        }
+        // should not reach here
+    }
+};
+
 static void res_ok(std::unique_ptr<server_http_res> & res, const json & response_data) {
    res->status = 200;
    res->data = safe_json_to_str(response_data);
@@ -1274,7 +1554,9 @@ void server_models_routes::init_routes() {
                {"created",       t},          // for OAI-compat
                {"status",        status},
                {"architecture",  architecture},
-                {"need_download", meta.need_download},
+                {"source",        server_model_source_to_string(meta.source)},
+                {"can_remove",    meta.source == SERVER_MODEL_SOURCE_CACHE},
+                // {"need_download", meta.need_download},
                // TODO: add other fields, may require reading GGUF metadata
            };

@@ -1312,6 +1594,87 @@ void server_models_routes::init_routes() {
        res_ok(res, {{"success", true}});
        return res;
    };
+
+    this->get_router_models_sse = [this](const server_http_req & req) {
+        auto res = std::make_unique<server_http_res>();
+        res->status = 200;
+        res->content_type = "text/event-stream";
+        auto sse_client = std::make_shared<server_models_sse_client>(models.sse);
+        res->next = [this, sse_client, &req](std::string & output) -> bool {
+            auto result = sse_client->next([&]() {
+                return stopping.load(std::memory_order_relaxed) || req.should_stop();
+            });
+            if (result == nullptr) {
+                return false; // client disconnected or should_stop
+            }
+            output = "data: " + safe_json_to_str(result->to_json()) + "\n\n";
+            return true; // listen for the next event
+        };
+        return res;
+    };
+
+    this->post_router_models = [this](const server_http_req & req) {
+        auto res = std::make_unique<server_http_res>();
+
+        json body = json::parse(req.body);
+        std::string name = json_value(body, "model", std::string());
+        if (name.empty()) {
+            throw std::invalid_argument("model must be a non-empty string");
+        }
+
+        common_params_model model;
+        common_download_opts opts;
+
+        model.name           = name;
+        model.hf_repo        = name;
+        opts.bearer_token    = params.hf_token;
+        opts.download_mmproj = true;
+        opts.download_mtp    = true;
+
+        // first, only check if the model is valid and can be downloaded
+        opts.skip_download = true;
+        bool ok = false;
+        try {
+            auto validation = common_download_model(model, opts);
+            ok = !validation.model_path.empty();
+        } catch (const common_skip_download_exception &) {
+            // model is valid and will be downloaded
+            ok = true;
+        } catch (...) {
+            SRV_ERR("unknown error while validating model '%s'\n", name.c_str());
+            // other exceptions will be handled by the outer ex_wrapper()
+            throw;
+        }
+
+        if (!ok) {
+            throw std::invalid_argument("model validation failed, unable to download");
+        }
+
+        // then, proceed with the actual download
+        opts.skip_download = false;
+        SRV_INF("starting download for model '%s'\n", name.c_str());
+        models.download(std::move(model), std::move(opts));
+
+        res_ok(res, {{"success", true}});
+        return res;
+    };
+
+    this->del_router_models = [this](const server_http_req & req) {
+        auto res = std::make_unique<server_http_res>();
+
+        std::string name = req.get_param("model");
+        if (name.empty()) {
+            throw std::invalid_argument("model must be a non-empty string");
+        }
+
+        bool ok = models.remove(name);
+        if (!ok) {
+            throw std::runtime_error("failed to remove model '" + name + "'");
+        }
+
+        res_ok(res, {{"success", true}});
+        return res;
+    };
 }


@@ -1,9 +1,11 @@
 #pragma once

 #include "common.h"
+#include "download.h"
 #include "preset.h"
 #include "server-common.h"
 #include "server-http.h"
+#include "server-queue.h"

 #include <mutex>
 #include <condition_variable>
@@ -14,6 +16,8 @@
 /**
 * state diagram:
 *
+ * DOWNLOADING ──► DOWNLOADED ──► (replaced by new instance)
+ *
 * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING
 *  ▲            │            │               ▲
 *  └───failed───┘            │               │
@@ -22,39 +26,43 @@
 */
 enum server_model_status {
    // TODO: also add downloading state when the logic is added
+    SERVER_MODEL_STATUS_DOWNLOADING,
+    SERVER_MODEL_STATUS_DOWNLOADED,
    SERVER_MODEL_STATUS_UNLOADED,
    SERVER_MODEL_STATUS_LOADING,
    SERVER_MODEL_STATUS_LOADED,
    SERVER_MODEL_STATUS_SLEEPING
 };

-static server_model_status server_model_status_from_string(const std::string & status_str) {
-    if (status_str == "unloaded") {
-        return SERVER_MODEL_STATUS_UNLOADED;
-    }
-    if (status_str == "loading") {
-        return SERVER_MODEL_STATUS_LOADING;
-    }
-    if (status_str == "loaded") {
-        return SERVER_MODEL_STATUS_LOADED;
-    }
-    if (status_str == "sleeping") {
-        return SERVER_MODEL_STATUS_SLEEPING;
-    }
-    throw std::runtime_error("invalid server model status");
-}
+enum server_model_source {
+    SERVER_MODEL_SOURCE_PRESET,
+    SERVER_MODEL_SOURCE_MODELS_DIR,
+    SERVER_MODEL_SOURCE_CACHE,
+};

 static std::string server_model_status_to_string(server_model_status status) {
    switch (status) {
-        case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
-        case SERVER_MODEL_STATUS_LOADING:  return "loading";
-        case SERVER_MODEL_STATUS_LOADED:   return "loaded";
-        case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
-        default:                           return "unknown";
+        case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading";
+        case SERVER_MODEL_STATUS_DOWNLOADED:  return "downloaded";
+        case SERVER_MODEL_STATUS_UNLOADED:    return "unloaded";
+        case SERVER_MODEL_STATUS_LOADING:     return "loading";
+        case SERVER_MODEL_STATUS_LOADED:      return "loaded";
+        case SERVER_MODEL_STATUS_SLEEPING:    return "sleeping";
+        default:                              return "unknown";
+    }
+}
+
+static std::string server_model_source_to_string(server_model_source source) {
+    switch (source) {
+        case SERVER_MODEL_SOURCE_PRESET:     return "preset";
+        case SERVER_MODEL_SOURCE_MODELS_DIR: return "models_dir";
+        case SERVER_MODEL_SOURCE_CACHE:      return "cache";
+        default:                             return "unknown";
    }
 }

 struct server_model_meta {
+    server_model_source source = SERVER_MODEL_SOURCE_CACHE;
    common_preset preset;
    std::string name;
    std::set<std::string> aliases; // additional names that resolve to this model
@@ -63,11 +71,11 @@ struct server_model_meta {
    server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
    int64_t last_used = 0; // for LRU unloading
    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
-    json loaded_info; // info to be reflected via /v1/models endpoint
+    json loaded_info; // info to be reflected via /v1/models endpoint ; if in DOWNLOADING state, it should contain download progress info
    int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
    mtmd_caps multimodal; // multimodal capabilities
-    bool need_download = false; // whether the model needs to be downloaded before loading
+    // bool need_download = false; // whether the model needs to be downloaded before loading // TODO @ngxson: implement this

    bool is_ready() const {
        return status == SERVER_MODEL_STATUS_LOADED;
@@ -85,12 +93,15 @@ struct server_model_meta {
    void update_caps();
 };

-struct subprocess_s;
+struct server_models_routes;
+struct server_subproc; // defined in server-models.cpp

 struct server_models {
+    friend struct server_models_routes;
+
 private:
    struct instance_t {
-        std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
+        std::shared_ptr<server_subproc> subproc; // shared between main thread and monitoring thread
        std::thread th;
        server_model_meta meta;
        FILE * stdin_file = nullptr;
@@ -107,6 +118,9 @@ private:
    // set to true while load_models() is executing a reload; load() will wait until clear
    bool is_reloading = false;

+    // if true, the next get_meta() will trigger a reload of model list
+    bool need_reload = false;
+
    common_preset_context ctx_preset;

    common_params base_params;
@@ -122,9 +136,14 @@ private:
    // not thread-safe, caller must hold mutex
    void add_model(server_model_meta && meta);

+    // notify SSE clients
+    void notify_sse(const std::string & event, const std::string & model_id, const json & data = nullptr);
+
 public:
    server_models(const common_params & params, int argc, char ** argv);

+    server_response sse; // for real-time updates via SSE endpoint
+
    // (re-)load the list of models from various sources and prepare the metadata mapping
    // - if this is called the first time, simply populate the metadata
    // - if this is called subsequently (e.g. when refreshing from disk):
@@ -147,13 +166,24 @@ public:
    void unload(const std::string & name);
    void unload_all();

+    // download a new model, progress is reported via SSE
+    // to stop the download, call unload()
+    void download(common_params_model && model, common_download_opts && opts);
+
    // update the status of a model instance (thread-safe)
    void update_status(const std::string & name, server_model_status status, int exit_code);
    void update_loaded_info(const std::string & name, std::string & raw_info);
+    void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);
+
+    // remove a cache model from disk and update the list (thread-safe)
+    // note: only cache models can be removed; returns false if the model doesn't exist or is not a cache model
+    bool remove(const std::string & name);

    // wait until the model instance is fully loaded (thread-safe)
+    // note: predicate is called while holding the lock
    // return when the model no longer in "loading" state
-    void wait_until_loading_finished(const std::string & name);
+    void wait(const std::string & name, std::function<bool(const server_model_meta &)> predicate);
+    void wait(std::unique_lock<std::mutex> & lk, const std::string & name, std::function<bool(const server_model_meta &)> predicate);

    // ensure the model is in ready state (thread-safe)
    // return false if model is ready
@@ -176,8 +206,9 @@ public:

 struct server_models_routes {
    common_params params;
-    json ui_settings = json::object();          // Primary: new name
-    json webui_settings = json::object();        // Deprecated: use ui_settings (kept for compat)
+    json ui_settings = json::object();     // Primary: new name
+    json webui_settings = json::object();  // Deprecated: use ui_settings (kept for compat)
+    std::atomic<bool> stopping = false;    // for graceful disconnecting SSE clients during shutdown
    server_models models;
    server_models_routes(const common_params & params, int argc, char ** argv)
            : params(params), models(params, argc, argv) {
@@ -206,6 +237,10 @@ struct server_models_routes {
    server_http_context::handler_t get_router_models;
    server_http_context::handler_t post_router_models_load;
    server_http_context::handler_t post_router_models_unload;
+    // management API
+    server_http_context::handler_t get_router_models_sse;
+    server_http_context::handler_t post_router_models;
+    server_http_context::handler_t del_router_models;
 };

 /**
@@ -331,6 +331,17 @@ void server_response::send(server_task_result_ptr && result) {
    }
 }

+void server_response::broadcast(server_task_result_ptr && result) {
+    std::unique_lock<std::mutex> lock(mutex_results);
+    for (const auto & id_task : waiting_task_ids) {
+        RES_DBG("task id = %d pushed to result queue\n", id_task);
+        server_task_result_ptr res_copy(result->clone());
+        res_copy->id = id_task; // override id with target task id
+        queue_results.emplace_back(std::move(res_copy));
+    }
+    condition_results.notify_all();
+}
+
 void server_response::terminate() {
    running = false;
    condition_results.notify_all();
@@ -154,11 +154,15 @@ public:
    // Send a new result to a waiting id_task
    void send(server_task_result_ptr && result);

+    // broadcast a new result to all waiting tasks
+    // (used by router mode)
+    void broadcast(server_task_result_ptr && result);
+
    // terminate the waiting loop
    void terminate();
 };

-// utility class to make working with server_queue and server_response easier
+// RAII wrapper to make working with server_queue and server_response easier
 // it provides a generator-like API for server responses
 // support pooling connection state and aggregating multiple results
 struct server_response_reader {
@@ -312,6 +312,9 @@ struct server_task_result {
    }
    virtual json to_json() = 0;
    virtual ~server_task_result() = default;
+    virtual server_task_result * clone() const {
+        GGML_ABORT("not implemented for this task type");
+    }
 };

 // using shared_ptr for polymorphism of server_task_result
@@ -649,3 +652,12 @@ struct server_prompt_cache {

    void update();
 };
+
+// used exclusively by router mode
+struct server_task_result_router : server_task_result {
+    json data;
+    virtual json to_json() override { return data; }
+    virtual server_task_result * clone() const override {
+        return new server_task_result_router(*this);
+    }
+};
@@ -174,8 +174,11 @@ int llama_server(int argc, char ** argv) {
        routes.get_props                   = models_routes->get_router_props;
        routes.get_models                  = models_routes->get_router_models;

+        ctx_http.post("/models",               ex_wrapper(models_routes->post_router_models));
        ctx_http.post("/models/load",          ex_wrapper(models_routes->post_router_models_load));
        ctx_http.post("/models/unload",        ex_wrapper(models_routes->post_router_models_unload));
+        ctx_http.get ("/models/sse",           ex_wrapper(models_routes->get_router_models_sse));
+        ctx_http.del ("/models",               ex_wrapper(models_routes->del_router_models));
    }

    ctx_http.get ("/health",                   ex_wrapper(routes.get_health)); // public endpoint (no API key check)
@@ -261,6 +264,7 @@ int llama_server(int argc, char ** argv) {
        clean_up = [&models_routes]() {
            SRV_INF("%s: cleaning up before exit...\n", __func__);
            if (models_routes.has_value()) {
+                models_routes->stopping.store(true); // maybe redundant, but just to be safe
                models_routes->models.unload_all();
            }
            llama_backend_free();
@@ -274,6 +278,10 @@ int llama_server(int argc, char ** argv) {
        ctx_http.is_ready.store(true);

        shutdown_handler = [&](int) {
+            if (models_routes.has_value()) {
+                // important to disconnect any SSE clients
+                models_routes->stopping.store(true);
+            }
            ctx_http.stop();
        };

@@ -1,3 +1,4 @@
+import threading
 import pytest
 from utils import *

@@ -253,3 +254,98 @@ def test_router_reload_models():
        assert "model-reload-c" in ids, "newly added model should appear"
    finally:
        os.remove(preset_path)
+
+
+MODEL_DOWNLOAD_ID = "ggml-org/test-model-router-download:F16"
+MODEL_DOWNLOAD_TIMEOUT = 300
+
+
+def _listen_sse(server: ServerProcess, collected: list, stop: threading.Event):
+    """Collect /models/sse events into `collected` until `stop` is set."""
+    url = f"http://{server.server_host}:{server.server_port}/models/sse"
+    try:
+        with requests.get(url, stream=True, timeout=MODEL_DOWNLOAD_TIMEOUT) as resp:
+            for line_bytes in resp.iter_lines():
+                if stop.is_set():
+                    break
+                line = line_bytes.decode("utf-8")
+                if line.startswith("data: "):
+                    collected.append(json.loads(line[6:]))
+    except Exception:
+        pass
+
+
+def _wait_for_sse_event(collected: list, event_type: str, model: str, timeout: int) -> bool:
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        if any(e.get("event") == event_type and e.get("model") == model for e in collected):
+            return True
+        time.sleep(0.5)
+    return False
+
+
+def test_router_download_model():
+    """Case 1: download a model, verify SSE events and GET /models."""
+    global server
+    server.start()
+
+    # Ensure the model is not present before we start
+    server.make_request("DELETE", f"/models?model={MODEL_DOWNLOAD_ID}")
+
+    sse_events: list = []
+    stop = threading.Event()
+    sse_thread = threading.Thread(
+        target=_listen_sse, args=(server, sse_events, stop), daemon=True
+    )
+    sse_thread.start()
+
+    # Trigger the download
+    res = server.make_request("POST", "/models", data={"model": MODEL_DOWNLOAD_ID})
+    assert res.status_code == 200
+    assert res.body.get("success") is True
+
+    # Wait for download_finished SSE event
+    finished = _wait_for_sse_event(
+        sse_events, "download_finished", MODEL_DOWNLOAD_ID, MODEL_DOWNLOAD_TIMEOUT
+    )
+    stop.set()
+
+    assert finished, "Never received download_finished SSE event"
+    assert any(
+        e.get("event") == "download_progress" and e.get("model") == MODEL_DOWNLOAD_ID
+        for e in sse_events
+    ), "No download_progress events received"
+
+    # Model should now appear in GET /models
+    ids = _get_model_ids(is_reload=False)
+    assert MODEL_DOWNLOAD_ID in ids, f"{MODEL_DOWNLOAD_ID} not found in /models after download"
+
+
+def test_router_delete_model():
+    """Case 2: delete the downloaded model, verify it disappears from GET /models."""
+    global server
+    server.start()
+
+    # Ensure the model exists (download it if needed)
+    if MODEL_DOWNLOAD_ID not in _get_model_ids(is_reload=False):
+        res = server.make_request("POST", "/models", data={"model": MODEL_DOWNLOAD_ID})
+        assert res.status_code == 200
+        sse_events: list = []
+        stop = threading.Event()
+        threading.Thread(
+            target=_listen_sse, args=(server, sse_events, stop), daemon=True
+        ).start()
+        finished = _wait_for_sse_event(
+            sse_events, "download_finished", MODEL_DOWNLOAD_ID, MODEL_DOWNLOAD_TIMEOUT
+        )
+        stop.set()
+        assert finished, "Model did not finish downloading before delete test"
+
+    # Delete the model
+    del_res = server.make_request("DELETE", f"/models?model={MODEL_DOWNLOAD_ID}")
+    assert del_res.status_code == 200
+    assert del_res.body.get("success") is True
+
+    # Model should no longer appear in GET /models
+    ids = _get_model_ids(is_reload=False)
+    assert MODEL_DOWNLOAD_ID not in ids, f"{MODEL_DOWNLOAD_ID} still present after deletion"
@@ -340,6 +340,9 @@ class ServerProcess:
        elif method == "POST":
            response = requests.post(url, headers=headers, json=data, timeout=timeout)
            parse_body = True
+        elif method == "DELETE":
+            response = requests.delete(url, headers=headers, timeout=timeout)
+            parse_body = True
        elif method == "OPTIONS":
            response = requests.options(url, headers=headers, timeout=timeout)
        else:
@@ -27,7 +27,7 @@ echo "Running pre-commit checks for llama-ui..."
 # Format only staged files
 staged_ui=$(git diff --cached --name-only -- tools/ui/)
 if [ -n "$staged_ui" ]; then
-    echo "$staged_ui" | xargs npx --no-install prettier --write
+    echo "$staged_ui" | xargs npm run format
    format_ok=$?
    # Re-stage formatted files
    git add tools/ui/
@@ -57,6 +57,7 @@ if [ $lint_ok -ne 0 ]; then
    echo "❌ Lint failed"
    exit 1
 fi
+
 if [ $test_ok -ne 0 ]; then
    echo "❌ Tests failed"
    exit 1
Author	SHA1	Message	Date
Neo Zhang	9724f664e8	[SYCL] rename GGML_SYCL_SUPPORT_LEVEL_ZERO (#24719 ) * rename GGML_SYCL_SUPPORT_LEVEL_ZERO to GGML_SYCL_SUPPORT_LEVEL_ZERO_API, and GGML_SYCL_ENABLE_LEVEL_ZERO to GGML_SYCL_USE_LEVEL_ZERO_API * fix code format * fix error when rebase	2026-06-18 11:18:26 +03:00
Neo Zhang	dd69db2924	sycl : support MUL_MAT and OUT_PROD with Q1_0 (#24721 )	2026-06-18 11:17:37 +03:00
Adrien Gallouët	6ec59ddaea	app : enable self-update only when built with llama-install.sh (#24754 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-18 09:57:59 +02:00
Sigbjørn Skjæret	32e806b9c1	ci : fix check-release message parsing (#24751 )	2026-06-18 09:32:56 +02:00
Neo Zhang	6f1034b32a	[SYCL] support OPs: conv_2d, conv_2d_dw, conv2d_transpose (#24600 ) * fix conflict * fix format issue, rename * rm debug code * correct the file name	2026-06-18 09:40:03 +03:00
Aleksander Grygier	0b73fc79fe	ui: Update code formatting command in pre-commit hook (#24685 )	2026-06-18 08:33:50 +02:00
Ravi Panchumarthy	4a79037b8b	ci : fix Windows x64 (OpenVINO) release link (#24731 )	2026-06-18 08:30:08 +02:00
Georgi Gerganov	cae0a3b0b0	metal : check for BF16 support in concat kernel (#24747 )	2026-06-18 09:16:06 +03:00
Xuan-Son Nguyen	f3e1828164	mtmd: llava_uhd should no longer use batch dim (#24732 )	2026-06-17 22:40:50 +02:00
shalinib-ibm	2e88c49c90	ggml-cpu: Conditionally enable power11 backend based on compiler support (#24687 ) * ggml: Conditionally enable power11 backend based on compiler support Guard POWER11 backend creation behind a compiler flag check for -mcpu=power11. This avoids build failures on current GCC/Clang toolchains while preserving forward compatibility once POWER11 support becomes available. * Update CMakeLists.txt ggml-cpu: Use -mcpu=power10 for P10 and P11	2026-06-18 02:45:19 +08:00
Georgi Gerganov	0843245cb1	metal : implement rope_back operator (#24725 ) Reuse existing rope kernels with a function constant to toggle forward/backward rotation, avoiding duplicate kernel code. Assisted-by: pi:llama.cpp/Qwen3.6-27B	2026-06-17 20:36:05 +03:00
Georgi Gerganov	8d2e580632	metal : add f16 and bf16 support for concat operator (#24724 ) * metal : add f16 and bf16 support for concat operator Extend the Metal backend concat operator to support f16 and bf16 tensor types in addition to the existing f32 and i32 support. - Template kernel_concat on type T with specializations for float, half, bfloat, and int - Add type-specific pipeline getter ggml_metal_library_get_pipeline_concat() - Update device support check to allow f16 unconditionally and bf16 when device supports bfloat16 - Update dispatch to select the correct kernel specialization by type Assisted-by: pi:llama.cpp/Qwen3.6-27B * metal : extend concat operator to support f16, bf16, i8, i16 and i64 Assisted-by: pi:llama.cpp/Qwen3.6-27B	2026-06-17 19:38:55 +03:00
Xuan-Son Nguyen	4b4d13ae72	server: (router) add model management API (#23976 ) * wip * server: (router) add SSE realtime updates API * nits * wip * add download API * add download api * update docs * add delete endpoint * fix std::terminate * fix crash * fix 2 * add tests * nits	2026-06-17 18:04:58 +02:00
Dev-iL	b4024af6c2	llama : skip main_gpu validation when no devices are available (#23405 )	2026-06-17 17:30:26 +03:00
Ruixiang Wang	1a2dea29b9	spec: fix segfault error on long prompts for eagle3 (#24707 )	2026-06-17 17:29:49 +03:00