Compare commits

..

15 Commits

Author SHA1 Message Date
Neo Zhang 9724f664e8 [SYCL] rename GGML_SYCL_SUPPORT_LEVEL_ZERO (#24719)
* rename GGML_SYCL_SUPPORT_LEVEL_ZERO to GGML_SYCL_SUPPORT_LEVEL_ZERO_API, and GGML_SYCL_ENABLE_LEVEL_ZERO to  GGML_SYCL_USE_LEVEL_ZERO_API

* fix code format

* fix error when rebase
2026-06-18 11:18:26 +03:00
Neo Zhang dd69db2924 sycl : support MUL_MAT and OUT_PROD with Q1_0 (#24721) 2026-06-18 11:17:37 +03:00
Adrien Gallouët 6ec59ddaea app : enable self-update only when built with llama-install.sh (#24754)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-18 09:57:59 +02:00
Sigbjørn Skjæret 32e806b9c1 ci : fix check-release message parsing (#24751) 2026-06-18 09:32:56 +02:00
Neo Zhang 6f1034b32a [SYCL] support OPs: conv_2d, conv_2d_dw, conv2d_transpose (#24600)
* fix conflict

* fix format issue, rename

* rm debug code

* correct the file name
2026-06-18 09:40:03 +03:00
Aleksander Grygier 0b73fc79fe ui: Update code formatting command in pre-commit hook (#24685) 2026-06-18 08:33:50 +02:00
Ravi Panchumarthy 4a79037b8b ci : fix Windows x64 (OpenVINO) release link (#24731) 2026-06-18 08:30:08 +02:00
Georgi Gerganov cae0a3b0b0 metal : check for BF16 support in concat kernel (#24747) 2026-06-18 09:16:06 +03:00
Xuan-Son Nguyen f3e1828164 mtmd: llava_uhd should no longer use batch dim (#24732) 2026-06-17 22:40:50 +02:00
shalinib-ibm 2e88c49c90 ggml-cpu: Conditionally enable power11 backend based on compiler support (#24687)
* ggml: Conditionally enable power11 backend based on compiler support

Guard POWER11 backend creation behind a compiler flag check for -mcpu=power11. This avoids build failures on current GCC/Clang toolchains while preserving forward compatibility once POWER11 support becomes available.

* Update CMakeLists.txt

ggml-cpu: Use -mcpu=power10 for P10 and P11
2026-06-18 02:45:19 +08:00
Georgi Gerganov 0843245cb1 metal : implement rope_back operator (#24725)
Reuse existing rope kernels with a function constant to toggle forward/backward
rotation, avoiding duplicate kernel code.

Assisted-by: pi:llama.cpp/Qwen3.6-27B
2026-06-17 20:36:05 +03:00
Georgi Gerganov 8d2e580632 metal : add f16 and bf16 support for concat operator (#24724)
* metal : add f16 and bf16 support for concat operator

Extend the Metal backend concat operator to support f16 and bf16 tensor
types in addition to the existing f32 and i32 support.

- Template kernel_concat on type T with specializations for float, half,
  bfloat, and int
- Add type-specific pipeline getter ggml_metal_library_get_pipeline_concat()
- Update device support check to allow f16 unconditionally and bf16 when
  device supports bfloat16
- Update dispatch to select the correct kernel specialization by type

Assisted-by: pi:llama.cpp/Qwen3.6-27B

* metal : extend concat operator to support f16, bf16, i8, i16 and i64

Assisted-by: pi:llama.cpp/Qwen3.6-27B
2026-06-17 19:38:55 +03:00
Xuan-Son Nguyen 4b4d13ae72 server: (router) add model management API (#23976)
* wip

* server: (router) add SSE realtime updates API

* nits

* wip

* add download API

* add download api

* update docs

* add delete endpoint

* fix std::terminate

* fix crash

* fix 2

* add tests

* nits
2026-06-17 18:04:58 +02:00
Dev-iL b4024af6c2 llama : skip main_gpu validation when no devices are available (#23405) 2026-06-17 17:30:26 +03:00
Ruixiang Wang 1a2dea29b9 spec: fix segfault error on long prompts for eagle3 (#24707) 2026-06-17 17:29:49 +03:00
57 changed files with 3435 additions and 1769 deletions
+4 -1
View File
@@ -46,11 +46,13 @@ jobs:
steps:
- id: check
env:
COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
echo "should_release=true" >> $GITHUB_OUTPUT
elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
echo "should_release=false" >> $GITHUB_OUTPUT
else
echo "should_release=true" >> $GITHUB_OUTPUT
@@ -542,6 +544,7 @@ jobs:
steps:
- name: Set OpenVINO version output
id: openvino_version
shell: bash
run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
- name: Clone
+26 -13
View File
@@ -20,16 +20,21 @@ int llama_fit_params(int argc, char ** argv);
int llama_quantize(int argc, char ** argv);
int llama_perplexity(int argc, char ** argv);
// hands the update over to the install script, which downloads and swaps the binary
// Self-update is only supported for binaries built with llama-install.sh
static int llama_update(int argc, char ** argv) {
(void) argc;
(void) argv;
#ifdef LLAMA_INSTALL_BUILD
#if defined(_WIN32)
return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
#else
return system("curl -fsSL https://llama.app/install.sh | sh");
#endif
#else
printf("Updates are available only when installed from https://llama.app\n");
return 1;
#endif
}
static const char * progname;
@@ -46,21 +51,29 @@ struct command {
int (*func)(int, char **);
};
#ifdef LLAMA_INSTALL_BUILD
#define UPDATE_HIDDEN false
#else
#define UPDATE_HIDDEN true
#endif
static const command cmds[] = {
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"update", "Update llama to the latest release", {}, false, llama_update },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
{"quantize", "Quantize a model", {}, true, llama_quantize },
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
{"version", "Show version", {}, false, version },
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
{"help", "Show available commands", {}, false, help },
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"update", "Update llama to the latest release", {}, UPDATE_HIDDEN, llama_update },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
{"quantize", "Quantize a model", {}, true, llama_quantize },
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
{"version", "Show version", {}, false, version },
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
{"help", "Show available commands", {}, false, help },
};
#undef UPDATE_HIDDEN
static int version(int argc, char ** argv) {
printf("%s\n", llama_build_info());
return 0;
+84
View File
@@ -997,3 +997,87 @@ std::vector<common_cached_model_info> common_list_cached_models() {
return result;
}
bool common_download_remove(const std::string & hf_repo_with_tag) {
namespace fs = std::filesystem;
auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
if (tag.empty()) {
return hf_cache::remove_cached_repo(repo_id);
}
std::string tag_upper = tag;
for (char & c : tag_upper) {
c = (char) std::toupper((unsigned char) c);
}
auto files = hf_cache::get_cached_files(repo_id);
if (files.empty()) {
return false;
}
// collect snapshot entries whose tag matches
std::vector<fs::path> to_remove;
for (const auto & f : files) {
auto split = get_gguf_split_info(f.path);
if (split.tag == tag_upper) {
to_remove.emplace_back(f.local_path);
}
}
if (to_remove.empty()) {
return false;
}
// resolve blob paths from symlinks before deleting snapshot entries
std::vector<fs::path> blobs_to_check;
for (const auto & p : to_remove) {
std::error_code ec;
if (fs::is_symlink(p, ec)) {
auto target = fs::read_symlink(p, ec);
if (!ec) {
blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
}
}
}
// remove snapshot entries
for (const auto & p : to_remove) {
std::error_code ec;
fs::remove(p, ec);
if (ec) {
LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
}
}
if (blobs_to_check.empty()) {
return true;
}
// collect blobs still referenced by remaining snapshot entries
std::unordered_set<std::string> still_referenced;
for (const auto & f : hf_cache::get_cached_files(repo_id)) {
fs::path p(f.local_path);
std::error_code ec;
if (fs::is_symlink(p, ec)) {
auto target = fs::read_symlink(p, ec);
if (!ec) {
still_referenced.insert((p.parent_path() / target).lexically_normal().string());
}
}
}
// remove orphaned blobs
for (const auto & blob : blobs_to_check) {
if (still_referenced.find(blob.string()) == still_referenced.end()) {
std::error_code ec;
fs::remove(blob, ec);
if (ec) {
LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
}
}
}
return true;
}
+7
View File
@@ -115,3 +115,10 @@ int common_download_file_single(const std::string & url,
// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);
// Remove a cached model from disk
// input format: "user/model" or "user/model:tag"
// - if tag is omitted, removes the entire repo cache directory
// - if tag is present, removes only files matching that tag (and orphaned blobs)
// returns true if anything was removed
bool common_download_remove(const std::string & hf_repo_with_tag);
+15
View File
@@ -495,4 +495,19 @@ std::string finalize_file(const hf_file & file) {
return file.final_path;
}
bool remove_cached_repo(const std::string & repo_id) {
if (!is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return false;
}
fs::path repo_path = get_repo_path(repo_id);
std::error_code ec;
auto removed = fs::remove_all(repo_path, ec);
if (ec) {
LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
return false;
}
return removed > 0;
}
} // namespace hf_cache
+3
View File
@@ -29,4 +29,7 @@ hf_files get_cached_files(const std::string & repo_id = {});
// Create snapshot path (link or move/copy) and return it
std::string finalize_file(const hf_file & file);
// Remove the entire cached directory for a repo, returns true if removed
bool remove_cached_repo(const std::string & repo_id);
} // namespace hf_cache
+60 -2
View File
@@ -161,6 +161,64 @@ You could update your test result in it directly.
Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.
## Quick Development WOW
This chapter is for quick development & try with SYCL backend on Intel GPU.
You need to install following sofeware before development:
- Intel GPU driver
- oneAPI package
- other development tools.
Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide.
- Linux
```
## build from source code
./examples/sycl/build.sh
## run CONV_2D_DW unit test cases
./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
## run all unit test cases
./build/bin/test-backend-ops -b SYCL0
## run with LLM on the first GPU
./examples/sycl/test.sh -mg 0 -m xxxx.gguf
## run service with LLM on the first GPU
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
./examples/sycl/start-svr.sh -m xxxx.gguf
## update the docs/ops.md for new/update OPs
./examples/sycl/update-ops-doc.sh
```
- Windows
```
## build from source code
examples\sycl\win-build-sycl.bat
## run CONV_2D_DW unit test cases
build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
## run all unit test cases
build\bin\test-backend-ops.exe -b SYCL0
## run LLM on the first GPU
examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
## run service with LLM on the first GPU
set ONEAPI_DEVICE_SELECTOR="level_zero:0"
examples\sycl\win-start-svr.bat -m xxxx.gguf
## update the docs/ops.md for new/update OPs
examples\sycl\win-update-ops-doc.bat
```
## Linux
### I. Setup Environment
@@ -701,7 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -716,7 +774,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
+3 -3
View File
@@ -27,11 +27,11 @@ Legend:
| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+1582 -1582
View File
File diff suppressed because it is too large Load Diff
+9
View File
@@ -0,0 +1,9 @@
#!/bin/bash
# MIT license
# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: MIT
./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
./scripts/create_ops_docs.py
+8
View File
@@ -0,0 +1,8 @@
@echo off
rem MIT license
rem Copyright (C) 2026 Intel Corporation
rem SPDX-License-Identifier: MIT
build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
python scripts\create_ops_docs.py
+1 -1
View File
@@ -249,7 +249,7 @@ option(GGML_SYCL "ggml: use SYCL"
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
+8 -1
View File
@@ -438,7 +438,14 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
# POWER11 backend: only if compiler supports -mcpu=power11
check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
if (GGML_CXX_SUPPORTS_POWER11)
message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
else()
message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
endif()
else()
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
endif()
+1 -1
View File
@@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11)
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (EXTRACTED_NUMBER EQUAL 9)
list(APPEND ARCH_FLAGS -mcpu=power9)
+20 -3
View File
@@ -66,7 +66,6 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base(ggml
const char * op_str = "undefined";
switch (op) {
case GGML_OP_ADD_ID: op_str = "add_id"; break;
case GGML_OP_CONCAT: op_str = "concat"; break;
default: GGML_ABORT("fatal error");
};
@@ -211,6 +210,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_meta
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat(ggml_metal_library_t lib, ggml_type tsrc) {
char base[256];
char name[256];
snprintf(base, 256, "kernel_concat_%s", ggml_type_name(tsrc));
snprintf(name, 256, "%s", base);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
}
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal_library_t lib, const ggml_tensor * op) {
char base[256];
char name[256];
@@ -1689,7 +1703,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm(ggml_metal_
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_library_t lib, const ggml_tensor * op) {
assert(op->op == GGML_OP_ROPE);
assert(op->op == GGML_OP_ROPE || op->op == GGML_OP_ROPE_BACK);
const bool is_back = op->op == GGML_OP_ROPE_BACK;
char base[256];
char name[256];
@@ -1713,13 +1729,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
}
snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
snprintf(name, 256, "%s_imrope=%d_is_back=%d", base, is_imrope ? 1 : 0, is_back ? 1 : 0);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
ggml_metal_cv_set_bool(cv, is_back, FC_ROPE + 1);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+1
View File
@@ -115,6 +115,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum (ggml_metal_library_t lib, const struct ggml_tensor * op);
+17 -5
View File
@@ -1123,13 +1123,24 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
return true;
case GGML_OP_CONCAT:
{
// kernel_concat copies one float-sized value per element.
// Other scalar types need a type-generic copy kernel first.
const enum ggml_type src0_type = op->src[0]->type;
const enum ggml_type src1_type = op->src[1]->type;
return src0_type == src1_type &&
src0_type == op->type &&
(src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
if (src0_type != src1_type || src0_type != op->type) {
return false;
}
switch (src0_type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
return true;
case GGML_TYPE_BF16:
return has_bfloat;
default:
return false;
}
}
case GGML_OP_ADD:
case GGML_OP_SUB:
@@ -1173,6 +1184,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_RMS_NORM:
return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
return true;
case GGML_OP_IM2COL:
return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
+2 -1
View File
@@ -375,6 +375,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
n_fuse = ggml_metal_op_norm(ctx, idx);
} break;
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
{
n_fuse = ggml_metal_op_rope(ctx, idx);
} break;
@@ -556,7 +557,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
/*.dim =*/ dim,
};
auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
auto pipeline = ggml_metal_library_get_pipeline_concat(lib, op->type);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+29 -12
View File
@@ -4358,6 +4358,7 @@ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_
#endif
constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];
constant bool FC_rope_is_back [[function_constant(FC_ROPE + 1)]];
static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
@@ -4381,6 +4382,9 @@ static void rope_yarn(
}
*cos_theta = cos(theta) * mscale;
*sin_theta = sin(theta) * mscale;
if (FC_rope_is_back) {
*sin_theta *= -1.0f;
}
}
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
@@ -7513,14 +7517,15 @@ template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<
template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
template<typename T>
kernel void kernel_concat(
constant ggml_metal_kargs_concat & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
constant ggml_metal_kargs_concat & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i3 = tgpig.z;
const int i2 = tgpig.y;
@@ -7533,21 +7538,33 @@ kernel void kernel_concat(
int o[4] = {0, 0, 0, 0};
o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
device const float * x;
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
device const T * x;
if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
x = (device const float *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00);
x = (device const T *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00);
} else {
x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
}
device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
*y = *x;
}
}
typedef decltype(kernel_concat<float>) kernel_concat_t;
template [[host_name("kernel_concat_f32")]] kernel kernel_concat_t kernel_concat<float>;
template [[host_name("kernel_concat_f16")]] kernel kernel_concat_t kernel_concat<half>;
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat<bfloat>;
#endif
template [[host_name("kernel_concat_i8")]] kernel kernel_concat_t kernel_concat<char>;
template [[host_name("kernel_concat_i16")]] kernel kernel_concat_t kernel_concat<short>;
template [[host_name("kernel_concat_i32")]] kernel kernel_concat_t kernel_concat<int>;
template [[host_name("kernel_concat_i64")]] kernel kernel_concat_t kernel_concat<long>;
template<int nr0, typename args_t>
void kernel_mul_mv_q2_K_f32_impl(
args_t args,
+5 -5
View File
@@ -39,8 +39,8 @@ if (WIN32)
set(CMAKE_CXX_COMPILER "icx")
set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
endif()
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO_API is enabled)
if(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH})
if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
@@ -105,8 +105,8 @@ endif()
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO_API ${GGML_SYCL_SUPPORT_LEVEL_ZERO_API}")
if (GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
# Link against Level Zero loader for direct device memory allocation.
# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
# in the xe kernel driver during multi-GPU inference.
@@ -114,7 +114,7 @@ if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
else()
+5 -5
View File
@@ -12,7 +12,7 @@
#include "common.hpp"
#include <sycl/backend.hpp>
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
#include <level_zero/ze_api.h>
#endif
@@ -84,9 +84,9 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
return sycl_down_blk_size;
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
return g_ggml_sycl_enable_level_zero &&
return g_ggml_sycl_use_level_zero_api &&
q.get_device().is_gpu() &&
q.get_backend() == sycl::backend::ext_oneapi_level_zero;
}
@@ -95,7 +95,7 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (ggml_sycl_use_level_zero_device_alloc(q)) {
void *ptr = nullptr;
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
@@ -127,7 +127,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
if (!ptr) return;
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (ggml_sycl_use_level_zero_device_alloc(q)) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
zeMemFree(ze_ctx, ptr);
+1 -1
View File
@@ -324,7 +324,7 @@ struct ggml_tensor_extra_gpu {
optimize_feature optimized_feature;
};
extern int g_ggml_sycl_enable_level_zero;
extern int g_ggml_sycl_use_level_zero_api;
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
void ggml_sycl_free_device(void *ptr, sycl::queue &q);
+158
View File
@@ -0,0 +1,158 @@
#include "conv2d-dw.hpp"
struct conv2d_dw_params {
int in_w, in_h;
int out_w, out_h;
int kernel_w, kernel_h;
int stride_x, stride_y;
int padding_x, padding_y;
int dilation_x, dilation_y;
int channels, batches;
};
struct conv2d_dw_kernel_bounds {
int y_min, y_max;
int x_min, x_max;
};
static inline conv2d_dw_kernel_bounds dw_calculate_kernel_bounds(int out_x, int out_y,
const conv2d_dw_params & p) {
conv2d_dw_kernel_bounds bounds;
bounds.y_min = sycl::max(0, (p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
bounds.y_max = sycl::min(p.kernel_h,
(p.in_h + p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
bounds.x_min = sycl::max(0, (p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
bounds.x_max = sycl::min(p.kernel_w,
(p.in_w + p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
return bounds;
}
static inline int dw_calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
return out_coord * stride + kern_coord * dilation - padding;
}
// whcn layout: input/output stored as [N, C, H, W]
struct dw_whcn_layout {
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.in_w * p.in_h) + c * p.in_w * p.in_h + y * p.in_w + x;
}
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
return c * p.kernel_h * p.kernel_w + ky * p.kernel_w + kx;
}
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.out_w * p.out_h) + c * p.out_w * p.out_h + y * p.out_w + x;
}
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
int & n, int & c, int & out_y, int & out_x) {
out_x = global_idx % p.out_w;
out_y = (global_idx / p.out_w) % p.out_h;
c = (global_idx / (p.out_w * p.out_h)) % p.channels;
n = global_idx / (p.out_w * p.out_h * p.channels);
}
};
// cwhn layout: input/output stored as [N, H, W, C]
struct dw_cwhn_layout {
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.in_w * p.in_h) + (y * p.in_w + x) * p.channels + c;
}
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
return (ky * p.kernel_w + kx) * p.channels + c;
}
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.out_w * p.out_h) + y * (p.out_w * p.channels) + x * p.channels + c;
}
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
int & n, int & c, int & out_y, int & out_x) {
c = global_idx % p.channels;
out_x = (global_idx / p.channels) % p.out_w;
out_y = (global_idx / (p.channels * p.out_w)) % p.out_h;
n = global_idx / (p.channels * p.out_w * p.out_h);
}
};
template <typename Layout>
static void conv2d_dw_kernel(const float * input, const float * kernel, float * output,
const conv2d_dw_params p, const sycl::nd_item<3> & item_ct1) {
const int global_idx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
const int total_elements = p.batches * p.channels * p.out_h * p.out_w;
if (global_idx >= total_elements) {
return;
}
int n, c, out_y, out_x;
Layout::unpack_indices(global_idx, p, n, c, out_y, out_x);
float acc = 0.0f;
const conv2d_dw_kernel_bounds bounds = dw_calculate_kernel_bounds(out_x, out_y, p);
for (int ky = bounds.y_min; ky < bounds.y_max; ++ky) {
const int in_y = dw_calculate_input_coord(out_y, ky, p.stride_y, p.dilation_y, p.padding_y);
for (int kx = bounds.x_min; kx < bounds.x_max; ++kx) {
const int in_x = dw_calculate_input_coord(out_x, kx, p.stride_x, p.dilation_x, p.padding_x);
acc += input[Layout::input_index(n, c, in_y, in_x, p)] *
kernel[Layout::kernel_index(c, ky, kx, p)];
}
}
output[Layout::output_index(n, c, out_y, out_x, p)] = acc;
}
template <typename Layout>
static void conv2d_dw_sycl(const float * x_d, const float * w_d, float * y_d,
const conv2d_dw_params p, const queue_ptr & stream) {
const int total = p.batches * p.channels * p.out_h * p.out_w;
const int num_blocks = (total + SYCL_CONV2D_DW_BLOCK_SIZE - 1) / SYCL_CONV2D_DW_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_DW_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv2d_dw_kernel<Layout>(x_d, w_d, y_d, p, item_ct1);
});
}
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * input = dst->src[1];
GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
const float * w_d = (const float *) kernel->data;
const float * x_d = (const float *) input->data;
float * y_d = (float *) dst->data;
const int32_t * p = (const int32_t *) dst->op_params;
const int stride_x = p[0];
const int stride_y = p[1];
const int padding_x = p[2];
const int padding_y = p[3];
const int dilation_x = p[4];
const int dilation_y = p[5];
const int in_w = input->ne[0];
const int in_h = input->ne[1];
const int kernel_w = kernel->ne[0];
const int kernel_h = kernel->ne[1];
const int out_w = dst->ne[0];
const int out_h = dst->ne[1];
const int channels = dst->ne[2];
const int batches = dst->ne[3];
const conv2d_dw_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h,
stride_x, stride_y, padding_x, padding_y,
dilation_x, dilation_y, channels, batches };
const queue_ptr stream = ctx.stream();
if (ggml_is_contiguous(input)) {
conv2d_dw_sycl<dw_whcn_layout>(x_d, w_d, y_d, params, stream);
} else if (ggml_is_contiguous_channels(input)) {
conv2d_dw_sycl<dw_cwhn_layout>(x_d, w_d, y_d, params, stream);
} else {
GGML_ABORT("Unsupported memory layout for conv2d_dw");
}
}
+10
View File
@@ -0,0 +1,10 @@
#ifndef GGML_SYCL_CONV2D_DW_HPP
#define GGML_SYCL_CONV2D_DW_HPP
#include "common.hpp"
#define SYCL_CONV2D_DW_BLOCK_SIZE 256
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV2D_DW_HPP
+125
View File
@@ -0,0 +1,125 @@
#include "conv2d-transpose.hpp"
#include "convert.hpp"
template <typename kernel_t>
static void conv2d_transpose_kernel(const float * input, const kernel_t * kernel, float * output,
const int in_w, const int in_h,
const int out_w, const int out_h,
const int kernel_w, const int kernel_h,
const int stride,
const int c_in, const int c_out, const int batches,
const sycl::nd_item<3> & item_ct1) {
const int global_idx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
const int total_elements = out_w * out_h * c_out * batches;
if (global_idx >= total_elements) {
return;
}
const int out_x = global_idx % out_w;
const int out_y = (global_idx / out_w) % out_h;
const int c_idx = (global_idx / (out_w * out_h)) % c_out;
const int n_idx = global_idx / (out_w * out_h * c_out);
float acc = 0.0f;
for (int c_in_idx = 0; c_in_idx < c_in; ++c_in_idx) {
for (int kh = 0; kh < kernel_h; ++kh) {
int in_y = out_y - kh;
if (in_y < 0 || in_y % stride) {
continue;
}
in_y /= stride;
if (in_y >= in_h) {
continue;
}
for (int kw = 0; kw < kernel_w; ++kw) {
int in_x = out_x - kw;
if (in_x < 0 || in_x % stride) {
continue;
}
in_x /= stride;
if (in_x >= in_w) {
continue;
}
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + in_w * in_y + in_x;
const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx +
kernel_w * kh + kw;
acc += input[input_idx] * ggml_sycl_cast<float>(kernel[kernel_idx]);
}
}
}
output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + out_w * out_y + out_x] = acc;
}
template <typename kernel_t>
static void conv2d_transpose_sycl(const float * input_d, const kernel_t * kernel_d, float * output_d,
const int in_w, const int in_h,
const int out_w, const int out_h,
const int kernel_w, const int kernel_h,
const int stride,
const int c_in, const int c_out, const int batches,
const queue_ptr & stream) {
const int total = out_w * out_h * c_out * batches;
const int num_blocks = (total + SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv2d_transpose_kernel<kernel_t>(input_d, kernel_d, output_d,
in_w, in_h, out_w, out_h, kernel_w, kernel_h,
stride, c_in, c_out, batches, item_ct1);
});
}
// input: (W, H, C_in, N)
// kernel: (W, H, C_out, C_in)
// output: (W, H, C_out, N)
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * input = dst->src[1];
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(input));
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(ggml_is_contiguous(dst));
const float * input_d = (const float *) input->data;
float * output_d = (float *) dst->data;
const void * kernel_d = kernel->data;
const int input_w = input->ne[0];
const int input_h = input->ne[1];
const int channels_in = input->ne[2];
const int batches = input->ne[3];
const int output_w = dst->ne[0];
const int output_h = dst->ne[1];
const int channels_out = kernel->ne[2];
const int kernel_w = kernel->ne[0];
const int kernel_h = kernel->ne[1];
const int stride = dst->op_params[0];
GGML_ASSERT(channels_in == kernel->ne[3]);
GGML_ASSERT(stride > 0);
const queue_ptr stream = ctx.stream();
if (kernel->type == GGML_TYPE_F16) {
conv2d_transpose_sycl<sycl::half>(input_d, (const sycl::half *) kernel_d, output_d,
input_w, input_h, output_w, output_h, kernel_w, kernel_h,
stride, channels_in, channels_out, batches, stream);
} else {
conv2d_transpose_sycl<float>(input_d, (const float *) kernel_d, output_d,
input_w, input_h, output_w, output_h, kernel_w, kernel_h,
stride, channels_in, channels_out, batches, stream);
}
}
+10
View File
@@ -0,0 +1,10 @@
#ifndef GGML_SYCL_CONV2D_TRANSPOSE_HPP
#define GGML_SYCL_CONV2D_TRANSPOSE_HPP
#include "common.hpp"
#define SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE 256
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV2D_TRANSPOSE_HPP
+150
View File
@@ -0,0 +1,150 @@
#include "conv2d.hpp"
#include "convert.hpp"
struct conv2d_params {
const int64_t IW, IH;
const int64_t OW, OH;
const int64_t KW, KH;
const int64_t ST_X, ST_Y;
const int64_t PD_X, PD_Y;
const int64_t DL_X, DL_Y;
const int64_t IC, OC;
const int64_t B;
const int64_t TOTAL;
};
struct conv2d_kernel_bounds {
int64_t y_min, y_max;
int64_t x_min, x_max;
};
static inline int64_t conv2d_max64(int64_t a, int64_t b) {
return (a > b) ? a : b;
}
static inline int64_t conv2d_min64(int64_t a, int64_t b) {
return (a < b) ? a : b;
}
static inline conv2d_kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv2d_params & P) {
conv2d_kernel_bounds bounds;
bounds.y_min = conv2d_max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
bounds.y_max = conv2d_min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
bounds.x_min = conv2d_max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
bounds.x_max = conv2d_min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
return bounds;
}
static inline int calculate_input_coord(int64_t out_coord, int64_t kern_coord, int64_t stride,
int64_t dilation, int64_t padding) {
return out_coord * stride + kern_coord * dilation - padding;
}
// whcn layout helpers (matching ggml tensor memory order)
static inline int64_t whcn_input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x;
}
static inline int64_t whcn_kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv2d_params & P) {
return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx;
}
static inline int64_t whcn_output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x;
}
template <typename T>
static void conv2d_kernel(const float * input, const T * kernel, float * output,
const conv2d_params P, const sycl::nd_item<3> & item_ct1) {
const int64_t global_idx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (global_idx >= P.TOTAL) {
return;
}
const int64_t out_x = global_idx % P.OW;
const int64_t out_y = (global_idx / P.OW) % P.OH;
const int64_t c_out = (global_idx / (P.OW * P.OH)) % P.OC;
const int64_t n = global_idx / (P.OW * P.OH * P.OC);
float acc = 0.0f;
const conv2d_kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
const float input_val = input[whcn_input_index(n, c_in, in_y, in_x, P)];
const T kernel_val = kernel[whcn_kernel_index(c_out, c_in, ky, kx, P)];
acc += input_val * ggml_sycl_cast<float>(kernel_val);
}
}
}
output[whcn_output_index(n, c_out, out_y, out_x, P)] = acc;
}
template <typename T>
static void conv2d_sycl(const float * X_D, const T * K_D, float * Y_D,
const conv2d_params P, const queue_ptr & stream) {
const int num_blocks = (P.TOTAL + SYCL_CONV2D_BLOCK_SIZE - 1) / SYCL_CONV2D_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv2d_kernel<T>(X_D, K_D, Y_D, P, item_ct1);
});
}
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * input = dst->src[1];
const float * K_D = (const float *) kernel->data;
const float * X_D = (const float *) input->data;
float * Y_D = (float *) dst->data;
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
GGML_ASSERT(input->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
// same number of input channels
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
const queue_ptr stream = ctx.stream();
const int32_t * p = (const int32_t *) dst->op_params;
const int ST_X = p[0];
const int ST_Y = p[1];
const int PD_X = p[2];
const int PD_Y = p[3];
const int DL_X = p[4];
const int DL_Y = p[5];
// no cwhn layout support
GGML_ASSERT(p[6] == 0);
const int IW = input->ne[0];
const int IH = input->ne[1];
const int OW = dst->ne[0];
const int OH = dst->ne[1];
const int KW = kernel->ne[0];
const int KH = kernel->ne[1];
const int IC = input->ne[2];
const int OC = kernel->ne[3];
const int B = input->ne[3];
const int64_t total = (int64_t) B * OC * OH * OW;
const conv2d_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };
if (kernel->type == GGML_TYPE_F16) {
conv2d_sycl<sycl::half>(X_D, (const sycl::half *) K_D, Y_D, params, stream);
} else {
conv2d_sycl<float>(X_D, K_D, Y_D, params, stream);
}
}
+10
View File
@@ -0,0 +1,10 @@
#ifndef GGML_SYCL_CONV2D_HPP
#define GGML_SYCL_CONV2D_HPP
#include "common.hpp"
#define SYCL_CONV2D_BLOCK_SIZE 256
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV2D_HPP
+6
View File
@@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q1_0:
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
switch (type) {
case GGML_TYPE_Q1_0:
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
case GGML_TYPE_BF16:
return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
#endif
case GGML_TYPE_Q1_0:
return dequantize_block_nc_sycl<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_block_nc_sycl<QK4_0, QR4_0, dequantize_q4_0>;
case GGML_TYPE_Q4_1:
+15
View File
@@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
const int iqs, dfloat2 &v) {
// Q1_0 reorder layout: scale values followed by quantized bits
const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
const int bit_index_0 = iqs + 0;
const int bit_index_1 = iqs + 1;
const int bit_0 = (*((const uint8_t *)qs + bit_index_0 / 8) >> (bit_index_0 % 8)) & 1;
const int bit_1 = (*((const uint8_t *)qs + bit_index_1 / 8) >> (bit_index_1 % 8)) & 1;
v.x() = (2 * bit_0 - 1) * d;
v.y() = (2 * bit_1 - 1) * d;
}
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q4_1 * x = (const block_q4_1 *) vx;
+53
View File
@@ -1423,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
}
}
static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
dequantize_mul_mat_vec_reorder<QK1_0, QR1_0, dequantize_q1_0_reorder>(
vx, y, dst, ncols, nrows, item_ct1);
});
}
}
static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
dequantize_mul_mat_vec<QK1_0, QR1_0, dequantize_q1_0>(
vx, y, dst, ncols, nrows, item_ct1);
});
}
}
static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
@@ -1759,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
sycl::half *src1_dfloat = nullptr; // dfloat == half
bool src1_convert_f16 =
src0->type == GGML_TYPE_Q1_0 ||
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
@@ -1777,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
#endif // GGML_SYCL_F16
switch (src0->type) {
case GGML_TYPE_Q1_0:
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
} else {
dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q4_0:
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+55 -29
View File
@@ -32,7 +32,7 @@
#include <sycl/sycl.hpp>
#include <sycl/backend.hpp>
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
#include <level_zero/ze_api.h>
#endif
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
@@ -62,6 +62,9 @@
#include "ggml-sycl/repeat_back.hpp"
#include "ggml-sycl/set_rows.hpp"
#include "ggml-sycl/set.hpp"
#include "ggml-sycl/conv2d.hpp"
#include "ggml-sycl/conv2d-dw.hpp"
#include "ggml-sycl/conv2d-transpose.hpp"
#include "ggml-sycl/ssm_conv.hpp"
#include "ggml-sycl/sycl_hw.hpp"
#include "ggml-sycl/ssm_scan.hpp"
@@ -84,7 +87,7 @@ int g_ggml_sycl_enable_vmm = 1;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_use_async_mem_op_requested = 1;
int g_ggml_sycl_enable_level_zero = 0;
int g_ggml_sycl_use_level_zero_api = 0;
int g_ggml_sycl_enable_flash_attention = 1;
int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
int g_ggml_sycl_usm_system = 0;
@@ -154,7 +157,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.ext_oneapi_level_zero = false;
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
ze_device_properties_t props = {};
@@ -169,13 +172,13 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.default_tensor_split[id] /= total_vram;
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
// Large buffers can be allocated before ggml_check_sycl() initializes other
// g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
g_ggml_sycl_enable_level_zero =
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
g_ggml_sycl_use_level_zero_api =
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_USE_LEVEL_ZERO_API", 1);
#else
g_ggml_sycl_enable_level_zero = 0;
g_ggml_sycl_use_level_zero_api = 0;
#endif
return info;
@@ -274,7 +277,7 @@ static void ggml_check_sycl() try {
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
if (g_ggml_sycl_enable_level_zero == 0) {
if (g_ggml_sycl_use_level_zero_api == 0) {
g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
}
@@ -309,10 +312,10 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
#endif
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: yes\n");
#else
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: no\n");
#endif
#if defined(GGML_SYCL_USE_VMM)
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
@@ -328,12 +331,12 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
#endif
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: %d\n", g_ggml_sycl_use_level_zero_api);
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
#else
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: Disable Level Zero API usage by compile flag\n");
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO_API\n",
g_ggml_sycl_dev2dev_memcpy);
#endif
#if GGML_SYCL_DNNL
@@ -599,7 +602,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
static bool ggml_sycl_is_l0_discrete_gpu(int device) {
return ggml_sycl_info().devices[device].l0_discrete_gpu;
}
@@ -608,12 +611,12 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
const bool l0_copy_supported =
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
if (g_ggml_sycl_use_level_zero_api && l0_copy_supported) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
@@ -973,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
}
switch(type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return max_compute_capability >= VER_GEN9 ? 128 : 64;
@@ -3504,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
@@ -3519,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
@@ -3529,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
@@ -3543,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -4664,12 +4672,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_ARGMAX:
ggml_sycl_argmax(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_sycl_op_conv_transpose_1d(ctx, dst);
case GGML_OP_CONV_2D:
ggml_sycl_op_conv2d(ctx, dst);
break;
case GGML_OP_CONV_2D_DW:
ggml_sycl_op_conv2d_dw(ctx, dst);
break;
case GGML_OP_CONV_3D:
ggml_sycl_conv_3d(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_sycl_op_conv_transpose_1d(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_2D:
ggml_sycl_op_conv2d_transpose(ctx, dst);
break;
case GGML_OP_REPEAT:
ggml_sycl_repeat(ctx, dst);
break;
@@ -5373,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
return nullptr;
}
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_sycl_device_context *sycl_ctx =
(ggml_backend_sycl_device_context *)dev->context;
int device = sycl_ctx->device;
@@ -5387,6 +5404,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
}
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_2D:
return true;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_SGN:
@@ -5434,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->src[1];
// disable Q1_0 until implementation
if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
return false;
}
if (a->ne[3] != b->ne[3]) {
return false;
}
ggml_type src0_type = op->src[0]->type;
// TODO: The configuration below needs more work to be supported with oneDNN
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
@@ -5456,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
// TODO: This specific configuration can fail with oneDNN and needs more debugging
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
printf("zjy 2\n");
return false;
}
return true;
}
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
return op->type == GGML_TYPE_F32 &&
(op->src[0]->type == GGML_TYPE_F32 ||
(op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] &&
op->src[0]->ne[3] == op->src[1]->ne[3])) &&
op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {
@@ -5718,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
GGML_UNUSED(dev);
}
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
bool res = do_ggml_backend_sycl_device_supports_op(dev, op);
GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op),
ggml_type_name(op->type), res ? "true" : "false");
return res;
}
static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
return false;
+74
View File
@@ -1194,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols(
}
}
static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy,
float * dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK1_0, QI1_0, block_q1_0,
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
}
template <int ncols_dst>
static void mul_mat_vec_q1_0_q8_1_sycl_ncols(
const void * vx, const void * vy, float * dst,
const int ncols, const int nrows,
const int stride_col_y, const int stride_col_dst,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_ncols<QK1_0, QI1_0, block_q1_0,
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1, ncols_dst>(
vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1);
});
});
}
static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
const void * vx, const void * vy, float * dst,
const int ncols, const int nrows, const int ncols_dst,
const int stride_col_y, const int stride_col_dst,
dpct::queue_ptr stream) {
switch (ncols_dst) {
case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break;
case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst);
}
}
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
@@ -2120,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q1_0:
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
const int stride_col_y = src1_padded_col_size / QK8_1;
const int stride_col_dst = dst->ne[0];
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols);
mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff,
src1_ncols, stride_col_y, stride_col_dst, stream);
return;
} else if (i == 0 || src1_ncols == 1) {
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n");
mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q2_K:
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
const int stride_col_y = src1_padded_col_size / QK8_1;
+45 -9
View File
@@ -1,11 +1,12 @@
#include "outprod.hpp"
#include "convert.hpp"
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
@@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(ne01 == ne11); // Inner dimensions must match
GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows
GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne2 % ne02 == 0);
GGML_ASSERT(ne3 % ne03 == 0);
// Get data pointers
const float* src0_d = (const float*)src0->data;
const float* src1_d = (const float*)src1->data;
float* dst_d = (float*)dst->data;
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
ggml_sycl_pool_alloc<float> src0_as_f32(ctx.pool());
int64_t src0_nb02 = nb02;
int64_t src0_nb03 = nb03;
if (src0->type == GGML_TYPE_Q1_0) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting src0 Q1_0 to fp32");
src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03);
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
GGML_ASSERT(to_fp32_sycl != nullptr);
to_fp32_sycl(src0->data, const_cast<float *>(src0_d), ne00 * ne01 * ne02 * ne03, stream);
// Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03].
src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float);
src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float);
}
// GEMM parameters
const float alpha = 1.0f;
@@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
const int64_t r2 = ne2 / ne02;
const int64_t r3 = ne3 / ne03;
try {
// Perform matrix multiplication using oneMKL GEMM
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
}
catch (sycl::exception const& exc) {
// OUT_PROD applies independently to each (i2, i3) destination plane.
for (int64_t i3 = 0; i3 < ne3; ++i3) {
for (int64_t i2 = 0; i2 < ne2; ++i2) {
const int64_t i03 = i3 / r3;
const int64_t i02 = i2 / r2;
const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03);
const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13);
float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3);
// Perform matrix multiplication using oneMKL GEMM
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
ne0, ne1, ne01, alpha, src0_plane, ne00,
src1_plane, ldb, beta, dst_plane, ne0);
}
}
} catch (sycl::exception const& exc) {
std::cerr << exc.what() << std::endl;
GGML_ASSERT(false);
}
+35
View File
@@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
}
#define VDR_Q1_0_Q8_1_MMVQ 1
#define VDR_Q1_0_Q8_1_MMQ 4
static __dpct_inline__ float
vec_dot_q1_0_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq;
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
const float d1 = bq1_0->d;
const int v = get_int_from_uint8_aligned(bq1_0->qs, iqs);
int vi_bytes[8];
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int shift = j * 4;
const int bits4 = (v >> shift) & 0x0F;
const int b0 = (bits4 & 0x01) ? 1 : -1;
const int b1 = (bits4 & 0x02) ? 1 : -1;
const int b2 = (bits4 & 0x04) ? 1 : -1;
const int b3 = (bits4 & 0x08) ? 1 : -1;
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
}
int sumi = 0;
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j);
sumi = ggml_sycl_dp4a(vi_bytes[j], u, sumi);
}
return d1 * bq8_1_chunk->ds[0] * sumi;
}
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
+1 -1
View File
@@ -1382,7 +1382,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
// eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
const int64_t n_embd = hparams.n_embd_inp();
const int64_t n_embd = hparams.n_embd_inp_enc();
const int64_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
+4
View File
@@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const {
return n_embd_inp;
}
uint32_t llama_hparams::n_embd_inp_enc() const {
return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp();
}
uint32_t llama_hparams::n_embd_out() const {
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
+7
View File
@@ -189,6 +189,10 @@ struct llama_hparams {
// input embedding dimension (0 = use n_embd)
uint32_t n_embd_inp_impl = 0;
// encoder input embedding dimension (0 = use n_embd_inp())
// e.g. the eagle3 encoder fuses target_layers * target_hidden features
uint32_t n_embd_inp_enc_impl = 0;
// output embedding dimension (0 = use n_embd)
uint32_t n_embd_out_impl = 0;
@@ -305,6 +309,9 @@ struct llama_hparams {
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of the encoder input embeddings
uint32_t n_embd_inp_enc() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
+1 -1
View File
@@ -249,7 +249,7 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama
}
// if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
if (params.split_mode == LLAMA_SPLIT_MODE_NONE && !model->devices.empty()) {
if (params.main_gpu < 0) {
model->devices.clear();
} else {
+4 -4
View File
@@ -19,7 +19,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
// eagle3 norm_before_residual (optional, default false)
// compatible with Readhat eagle3 speculator model
@@ -34,7 +34,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
LLAMA_LOAD_LOCALS;
const int64_t n_embd_inp = hparams.n_embd_inp();
const int64_t n_embd_inp = hparams.n_embd_inp_enc();
const int64_t n_embd_attn_input = 2 * n_embd;
// Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
@@ -109,8 +109,8 @@ ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
// Input: Target model features (3 layers concatenated: low, mid, high)
// Data will be provided via ubatch->embd in encode_eagle3_features()
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp_enc());
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens);
ggml_set_input(inp_target->embd);
cur = inp_target->embd;
+21
View File
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
output.entries.push_back(std::move(res));
}
output.grid_x = inst.grid_size.width;
output.grid_y = inst.grid_size.height;
return true;
}
@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
output.entries.push_back(std::move(img_f32));
return true;
}
bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
// call super class preprocessor
bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
if (!ok) {
return false;
}
if (output.entries.size() == 1) {
// Single-tile (overview only): append one newline row.
output.entries[0]->add_newline = true;
} else {
// Multi-tile: overview gets no newline, grid tiles get one.
output.entries[0]->add_newline = false;
for (size_t i = 1; i < output.entries.size(); ++i) {
output.entries[i]->add_newline = true;
}
}
return true;
}
+6
View File
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
// similar to llava_uhd, but has add_newline
struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
+6 -25
View File
@@ -639,7 +639,7 @@ struct mtmd_context {
{
img_beg = "<image>";
img_end = "";
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
} break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
GGML_ASSERT(!bitmaps.empty());
if (!bitmaps[0]->is_audio) {
// note: only one type of media is supported per call, caller should enforce this
const bool is_vision = !bitmaps[0]->is_audio;
if (is_vision) {
// handle image
if (!ctx->ctx_v) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
batch_f32.grid_y = tmp_batch.grid_y;
}
// Annotate llava-next style tiles so clip_n_output_tokens accounts
// for per-tile newline injection.
if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
if (batch_f32.entries.size() == 1) {
// Single-tile (overview only): append one newline row.
batch_f32.entries[0]->add_newline = true;
} else {
// Multi-tile: overview gets no newline, grid tiles get one.
batch_f32.entries[0]->add_newline = false;
for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
batch_f32.entries[i]->add_newline = true;
}
}
}
// handle llava-uhd style preprocessing
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
if (
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
) {
if (has_tiling_grid) {
// [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
GGML_ASSERT(bitmaps.size() == 1);
+18
View File
@@ -180,6 +180,24 @@ That requires `JSON.stringify` when formatted to message content:
}
```
### Model management API (router mode)
Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
The main goal of this API is to allow downloading models and/or removing models from the web UI. It relies on the model cache infrastructure under the hood to manage the list of models dynamically.
Instead of building everything from the ground up (like what most AI agents will do when you ask them to implement a similar feature), we built on top of existing, already well-engineered components inside the codebase:
- Model cache infrastructure as mentioned above (`common/download.h`)
- Server response queue (`server-queue.h`). We use this feature to broadcast events to SSE clients.
- Server router thread management (`server-models.h`). We re-use the same thread model that is used for managing subprocess life cycle, except that we don't create a new subprocess, but launch the download right inside the thread.
The flow for downloading a new model:
- POST request comes in --> `post_router_models` --> validation
- `server_models::download()` is called
- Sets up a new thread `inst.th` and runs the download inside
- If a stop request comes in, set `stop_download` to `true`
- Otherwise, upon completion, we call `load_models()` to refresh the list of models
### Notable Related PRs
- Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443
+115
View File
@@ -1778,6 +1778,20 @@ The `status` object can be:
}
```
Note: for "downloading" state, there can be multiple files be downloading in parallel
```json
"status": {
"value": "downloading",
"progress": {
"https://...model.gguf": {
"done": 195963406,
"total": 219307424
}
}
}
```
### POST `/models/load`: Load a model
Load a model
@@ -1820,6 +1834,107 @@ Response:
}
```
### GET `/models/sse`: Real-time events
Example events:
```js
{
"model": "...",
"event": "model_status",
"data": {
"status": "loading"
}
}
{
"model": "...",
"event": "download_progress",
"data": {
// note: there can be multiple files being downloaded in parallel
"https://...model.gguf": {
"done": 195963406,
"total": 219307424
}
}
}
{
"model": "...",
"event": "download_finished",
"data": {
"status": "loading"
}
}
{
"model": "...",
"event": "model_remove"
}
// special event: reload of the list of all models
{
"model": "*",
"event": "models_reload"
}
```
### POST `/models`: Download new model
Trigger a new download (non-blocking), the progress can be tracked via SSE endpoint `/models/sse`
To cancel model downloading, send an event to `/models/unload`
Download procedure:
- Send POST request to `/models`
- Subscribe to `/models/sse` for updates
- On downloading completed, you will receive either `download_finished` or `download_failed` event
- Call GET `/models` to trigger model list update. If the download success, you should see the new model in the list
Payload:
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
}
```
Response (download is started in the background):
```json
{
"success": true
}
```
Response (error, cannot start the download):
```json
{
"error": {
"code": 400,
"message": "model validation failed, unable to download",
"type": "invalid_request_error"
}
}
```
### DELETE `/models`: Delete a model from cache
IMPORTANT: only model stored in cache can be deleted. You cannot delete models in a preset.
Model name must be passed via query param: `?model={name}`
If delete success, it will send an SSE event of type `model_remove`
Response:
```json
{
"success": true
}
```
## API errors
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+17
View File
@@ -588,6 +588,23 @@ void server_http_context::post(const std::string & path, const server_http_conte
});
}
void server_http_context::del(const std::string & path, const server_http_context::handler_t & handler) const {
handlers.emplace(path, handler);
pimpl->srv->Delete(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
get_params(req),
get_headers(req),
req.path,
build_query_string(req),
req.body,
{},
req.is_connection_closed
});
server_http_res_ptr response = handler(*request);
process_handler_response(std::move(request), response, res);
});
}
//
// Vertex AI Prediction protocol (AIP_PREDICT_ROUTE)
// https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements
+1
View File
@@ -86,6 +86,7 @@ struct server_http_context {
void get(const std::string & path, const handler_t & handler) const;
void post(const std::string & path, const handler_t & handler) const;
void del(const std::string & path, const handler_t & handler) const;
// Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
// Must be called AFTER all other API routes are registered
+398 -35
View File
@@ -9,6 +9,7 @@
#include <sheredom/subprocess.h>
#include <functional>
#include <optional>
#include <algorithm>
#include <thread>
#include <mutex>
@@ -51,6 +52,21 @@ extern char **environ;
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
#define CHILD_ADDR "127.0.0.1"
struct server_subproc {
std::optional<subprocess_s> sproc; // empty while in DOWNLOADING state
std::atomic<bool> stop_download{false}; // flag to signal download cancellation
subprocess_s & get() {
GGML_ASSERT(sproc.has_value() && "subprocess not initialized");
return sproc.value();
}
bool is_alive() {
return sproc.has_value() && subprocess_alive(&sproc.value());
}
};
static std::filesystem::path get_server_exec_path() {
#if defined(_WIN32)
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
@@ -272,12 +288,25 @@ void server_models::add_model(server_model_meta && meta) {
meta.update_caps();
std::string name = meta.name;
mapping[name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* subproc */ std::make_shared<server_subproc>(),
/* th */ std::thread(),
/* meta */ std::move(meta)
};
}
void server_models::notify_sse(const std::string & event, const std::string & model_id, const json & data) {
std::unique_ptr<server_task_result_router> result = std::make_unique<server_task_result_router>();
result->data = {
{"model", model_id},
{"event", event},
};
if (!data.is_null()) {
result->data["data"] = data;
}
SRV_DBG("notifying SSE clients about event '%s' for model '%s': %s\n", event.c_str(), model_id.c_str(), safe_json_to_str(result->data).c_str());
sse.broadcast(std::move(result));
}
void server_models::load_models() {
// Phase 1: load presets from all sources — pure I/O, no lock needed
// 1. cached models
@@ -304,19 +333,27 @@ void server_models::load_models() {
// note: if a model exists in both cached and local, local takes precedence
common_presets final_presets;
for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
for (const auto & [name, preset] : local_models) final_presets[name] = preset;
std::unordered_map<std::string, server_model_source> source_map;
for (const auto & [name, preset] : cached_models) {
final_presets[name] = preset;
source_map[name] = SERVER_MODEL_SOURCE_CACHE;
}
for (const auto & [name, preset] : local_models) {
final_presets[name] = preset;
source_map[name] = SERVER_MODEL_SOURCE_MODELS_DIR;
}
for (const auto & [name, custom] : custom_presets) {
if (final_presets.find(name) != final_presets.end()) {
final_presets[name].merge(custom);
} else {
final_presets[name] = custom;
}
source_map[name] = SERVER_MODEL_SOURCE_PRESET;
}
// server base preset from CLI args takes highest precedence
for (auto & [name, preset] : final_presets) {
preset.merge(base_preset);
}
auto get_source = [&](const std::string & name) {
return source_map.count(name) ? source_map.at(name) : SERVER_MODEL_SOURCE_PRESET;
};
// Helpers that read `mapping` — must be called while holding the lock.
std::unordered_set<std::string> custom_names;
@@ -366,12 +403,15 @@ void server_models::load_models() {
// (unload, load) or when joining threads (the monitoring thread calls update_status
// which locks the mutex, so joining while holding it would deadlock).
std::unique_lock<std::mutex> lk(mutex);
need_reload = false;
bool is_first_load = mapping.empty();
if (is_first_load) {
// FIRST LOAD: add all models, then unlock for autoloading
for (const auto & [name, preset] : final_presets) {
server_model_meta meta{
/* source */ get_source(name),
/* preset */ preset,
/* name */ name,
/* aliases */ {},
@@ -384,7 +424,7 @@ void server_models::load_models() {
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
/* need_download */ false,
// /* need_download */ false,
};
add_model(std::move(meta));
}
@@ -453,6 +493,9 @@ void server_models::load_models() {
}
}
for (auto & [name, inst] : mapping) {
if (inst.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
continue; // downloading models are not from config sources, leave them alone
}
if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) {
threads_to_join.push_back(std::move(inst.th));
}
@@ -465,7 +508,15 @@ void server_models::load_models() {
// erase models no longer in any source
for (auto it = mapping.begin(); it != mapping.end(); ) {
if (final_presets.find(it->first) == final_presets.end()) {
if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
++it; // download thread is still busy, skip
} else if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADED) {
// download finished, safe to erase
if (it->second.th.joinable()) {
it->second.th.join();
}
it = mapping.erase(it);
} else if (final_presets.find(it->first) == final_presets.end()) {
SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str());
GGML_ASSERT(!it->second.th.joinable()); // must have been joined above
it = mapping.erase(it);
@@ -526,6 +577,7 @@ void server_models::load_models() {
for (const auto & [name, preset] : final_presets) {
if (mapping.find(name) == mapping.end()) {
server_model_meta meta{
/* source */ get_source(name),
/* preset */ preset,
/* name */ name,
/* aliases */ {},
@@ -538,7 +590,7 @@ void server_models::load_models() {
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
/* need_download */ false,
// /* need_download */ false,
};
add_model(std::move(meta));
newly_added.push_back(name);
@@ -571,6 +623,8 @@ void server_models::load_models() {
SRV_INF("(reload) loading new model %s\n", name.c_str());
load(name);
}
notify_sse("models_reload", "*");
}
}
@@ -597,7 +651,13 @@ bool server_models::has_model(const std::string & name) {
}
std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
std::unique_lock<std::mutex> lk(mutex);
if (need_reload) {
lk.unlock();
load_models();
lk.lock();
}
auto it = mapping.find(name);
if (it != mapping.end()) {
return it->second.meta;
@@ -683,7 +743,13 @@ static std::vector<char *> to_char_ptr_array(const std::vector<std::string> & ve
}
std::vector<server_model_meta> server_models::get_all_meta() {
std::lock_guard<std::mutex> lk(mutex);
std::unique_lock<std::mutex> lk(mutex);
if (need_reload) {
lk.unlock();
load_models();
lk.lock();
}
std::vector<server_model_meta> result;
result.reserve(mapping.size());
for (const auto & [name, inst] : mapping) {
@@ -770,7 +836,7 @@ void server_models::load(const std::string & name) {
throw std::runtime_error("failed to get a port number");
}
inst.subproc = std::make_shared<subprocess_s>();
inst.subproc = std::make_shared<server_subproc>();
{
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
@@ -792,19 +858,20 @@ void server_models::load(const std::string & name) {
// TODO @ngxson : maybe separate stdout and stderr in the future
// so that we can use stdout for commands and stderr for logging
int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get());
inst.subproc->sproc.emplace();
int result = subprocess_create_ex(argv.data(), options, envp.data(), &inst.subproc->get());
if (result != 0) {
throw std::runtime_error("failed to spawn server instance");
}
inst.stdin_file = subprocess_stdin(inst.subproc.get());
inst.stdin_file = subprocess_stdin(&inst.subproc->get());
}
// start a thread to manage the child process
// captured variables are guaranteed to be destroyed only after the thread is joined
inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port, stop_timeout = inst.meta.stop_timeout]() {
FILE * stdin_file = subprocess_stdin(child_proc.get());
FILE * stdout_file = subprocess_stdout(child_proc.get()); // combined stdout/stderr
FILE * stdin_file = subprocess_stdin(&child_proc->get());
FILE * stdout_file = subprocess_stdout(&child_proc->get()); // combined stdout/stderr
std::thread log_thread([&]() {
// read stdout/stderr and forward to main server log
@@ -834,14 +901,14 @@ void server_models::load(const std::string & name) {
return this->stopping_models.find(name) != this->stopping_models.end();
};
auto should_wake = [&]() {
return is_stopping() || !subprocess_alive(child_proc.get());
return is_stopping() || !child_proc->is_alive();
};
{
std::unique_lock<std::mutex> lk(this->mutex);
this->cv_stop.wait(lk, should_wake);
}
// child may have already exited (e.g. crashed) — skip shutdown sequence
if (!subprocess_alive(child_proc.get())) {
if (!child_proc->is_alive()) {
return;
}
SRV_INF("stopping model instance name=%s\n", name.c_str());
@@ -859,7 +926,7 @@ void server_models::load(const std::string & name) {
if (elapsed >= stop_timeout * 1000) {
// timeout, force kill
SRV_WRN("force-killing model instance name=%s after %d seconds timeout\n", name.c_str(), stop_timeout);
subprocess_terminate(child_proc.get());
subprocess_terminate(&child_proc->get());
return;
}
this->cv_stop.wait_for(lk, std::chrono::seconds(1));
@@ -884,8 +951,8 @@ void server_models::load(const std::string & name) {
// get the exit code
int exit_code = 0;
subprocess_join(child_proc.get(), &exit_code);
subprocess_destroy(child_proc.get());
subprocess_join(&child_proc->get(), &exit_code);
subprocess_destroy(&child_proc->get());
// update status and exit code
this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
@@ -896,30 +963,118 @@ void server_models::load(const std::string & name) {
{
auto & old_instance = mapping[name];
// old process should have exited already, but just in case, we clean it up here
if (subprocess_alive(old_instance.subproc.get())) {
if (old_instance.subproc->is_alive()) {
SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str());
subprocess_terminate(old_instance.subproc.get()); // force kill
subprocess_terminate(&old_instance.subproc->get()); // force kill
}
if (old_instance.th.joinable()) {
old_instance.th.join();
}
}
notify_sse("model_status", name, {
{"status", server_model_status_to_string(inst.meta.status)},
});
mapping[name] = std::move(inst);
cv.notify_all();
}
// callback for model downloading functionality
struct server_models_download_res : public common_download_callback {
common_params_model model;
common_download_opts opts;
std::function<bool()> should_stop;
std::function<void(const common_download_progress & p)> on_progress;
bool is_ok = false;
bool run() {
try {
common_download_model(model, opts);
is_ok = true;
} catch (const std::exception & e) {
SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
is_ok = false;
}
return is_ok;
}
void on_start(const common_download_progress & p) override {
on_progress(p);
}
void on_update(const common_download_progress & p) override {
on_progress(p);
}
void on_done(const common_download_progress &, bool ok) override {
is_ok = ok;
}
bool is_cancelled() const override {
return should_stop();
}
};
void server_models::download(common_params_model && model, common_download_opts && opts) {
std::string name = model.name;
GGML_ASSERT(name == model.hf_repo);
std::unique_lock<std::mutex> lk(mutex);
if (mapping.find(name) != mapping.end()) {
throw std::runtime_error("model name=" + name + " already exists");
}
instance_t inst;
inst.meta.name = name;
inst.meta.status = SERVER_MODEL_STATUS_DOWNLOADING;
inst.subproc = std::make_shared<server_subproc>();
auto dl = std::make_unique<server_models_download_res>();
dl->model = model; // copy
dl->opts = opts; // copy
dl->should_stop = [sp = inst.subproc]() {
return sp->stop_download.load(std::memory_order_relaxed);
};
dl->on_progress = [this, name](const common_download_progress & p) {
update_download_progress(name, p, false);
};
inst.th = std::thread([this, dl = std::move(dl)]() {
dl->opts.callback = dl.get();
bool ok = dl->run();
SRV_INF("download finished for model name=%s with status=%s\n",
dl->model.name.c_str(), ok ? "success" : "failure");
update_download_progress(dl->model.name, {}, true, ok);
// need_reload is set inside update_download_progress under the mutex;
// the next load_models() call will clean up this instance
});
mapping[name] = std::move(inst);
notify_sse("status_update", name, {
{"status", server_model_status_to_string(SERVER_MODEL_STATUS_DOWNLOADING)},
});
cv.notify_all();
}
void server_models::unload(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
if (it->second.meta.is_running()) {
if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
SRV_INF("cancelling download for model name=%s\n", name.c_str());
it->second.subproc->stop_download.store(true, std::memory_order_relaxed);
// for convenience, we wait the status change here
wait(lk, name, [](const server_model_meta & new_meta) {
return new_meta.status != SERVER_MODEL_STATUS_DOWNLOADING;
});
} else if (it->second.meta.is_running()) {
SRV_INF("stopping model instance name=%s\n", name.c_str());
stopping_models.insert(name);
if (it->second.meta.status == SERVER_MODEL_STATUS_LOADING) {
// special case: if model is in loading state, unloading means force-killing it
SRV_WRN("model name=%s is still loading, force-killing\n", name.c_str());
subprocess_terminate(it->second.subproc.get());
subprocess_terminate(&it->second.subproc->get());
}
cv_stop.notify_all();
// status change will be handled by the managing thread
@@ -934,7 +1089,10 @@ void server_models::unload_all() {
{
std::lock_guard<std::mutex> lk(mutex);
for (auto & [name, inst] : mapping) {
if (inst.meta.is_running()) {
if (inst.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) {
SRV_INF("cancelling download for model name=%s\n", name.c_str());
inst.subproc->stop_download.store(true, std::memory_order_relaxed);
} else if (inst.meta.is_running()) {
SRV_INF("stopping model instance name=%s\n", name.c_str());
stopping_models.insert(name);
cv_stop.notify_all();
@@ -959,6 +1117,17 @@ void server_models::update_status(const std::string & name, server_model_status
meta.status = status;
meta.exit_code = exit_code;
}
// broadcast status change to SSE
{
json data = {
{"status", server_model_status_to_string(status)},
};
if (status == SERVER_MODEL_STATUS_UNLOADED) {
data["exit_code"] = exit_code;
}
// note: notify_sse doesn't acquire the lock, so no deadlock here
notify_sse("status_change", name, data);
}
cv.notify_all();
}
@@ -985,12 +1154,82 @@ void server_models::update_loaded_info(const std::string & name, std::string & r
cv.notify_all();
}
void server_models::wait_until_loading_finished(const std::string & name) {
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &name]() {
void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
json curr;
{
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
if (done) {
// mark the instance to be erased on next load_models() call
it->second.meta.status = SERVER_MODEL_STATUS_DOWNLOADED;
need_reload = true;
} else {
json & info = it->second.meta.loaded_info;
if (!info.contains("progress")) {
info["progress"] = json{};
}
info["progress"][progress.url] = {
{"done", progress.downloaded},
{"total", progress.total},
};
curr = it->second.meta.loaded_info; // copy
}
}
}
if (done) {
cv.notify_all(); // notify in case unload() is waiting for download to be cancelled
notify_sse(ok ? "download_finished" : "download_failed", name, {});
} else {
notify_sse("download_progress", name, curr);
}
}
bool server_models::remove(const std::string & name) {
auto meta = get_meta(name);
if (!meta.has_value()) {
throw std::runtime_error("model name=" + name + " is not found");
}
if (meta->source != SERVER_MODEL_SOURCE_CACHE) {
throw std::runtime_error("model name=" + name + " is not removable (not from cache)");
}
unload(name); // cancel download or stop running instance
{
std::unique_lock<std::mutex> lk(mutex);
// a cancelled download lands on DOWNLOADED; a stopped instance lands on UNLOADED
wait(lk, name, [](const server_model_meta & new_meta) {
return new_meta.status == SERVER_MODEL_STATUS_UNLOADED
|| new_meta.status == SERVER_MODEL_STATUS_DOWNLOADED;
});
// join before erasing - after status reaches UNLOADED/DOWNLOADED the thread no
// longer acquires this mutex, so joining while holding it is safe
if (mapping[name].th.joinable()) {
mapping[name].th.join();
}
// remove the model from disk (hold lock to prevent concurrent load)
bool ok = common_download_remove(name);
if (ok) {
mapping.erase(name);
}
SRV_INF("removing model name=%s from cache (%s)\n", name.c_str(), ok ? "succeeded" : "failed");
notify_sse("model_remove", name, {});
return ok;
}
}
void server_models::wait(const std::string & name, std::function<bool(const server_model_meta &)> predicate) {
std::unique_lock<std::mutex> lk(mutex);
wait(lk, name, predicate);
}
void server_models::wait(std::unique_lock<std::mutex> & lk, const std::string & name, std::function<bool(const server_model_meta &)> predicate) {
cv.wait(lk, [this, &name, &predicate]() {
auto it = mapping.find(name);
if (it != mapping.end()) {
return predicate(it->second.meta);
}
return false;
});
@@ -1014,10 +1253,15 @@ bool server_models::ensure_model_ready(const std::string & name) {
// wait for loading to complete
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
wait_until_loading_finished(name);
wait(name, [&meta](const server_model_meta & new_meta) {
if (new_meta.status != SERVER_MODEL_STATUS_LOADING) {
meta = new_meta; // update meta for final check after wait
return true;
}
return false;
});
// check final status
meta = get_meta(name);
if (!meta.has_value() || meta->is_failed()) {
throw std::runtime_error("model name=" + name + " failed to load");
}
@@ -1111,6 +1355,42 @@ void server_models::notify_router_sleeping_state(bool is_sleeping) {
// server_models_routes
//
// RAII wrapper similar to server_response_reader, but doesn't use server_queue
static std::atomic<int> sse_client_id_counter = 0;
struct server_models_sse_client {
server_response & queue_results;
int client_id;
server_models_sse_client(server_response & q)
: queue_results(q), client_id(sse_client_id_counter.fetch_add(1, std::memory_order_relaxed)) {
SRV_DBG("new SSE client connected, assigned client_id=%d\n", client_id);
queue_results.add_waiting_task_id(client_id);
}
~server_models_sse_client() {
SRV_DBG("SSE client disconnected, removing client_id=%d\n", client_id);
queue_results.remove_waiting_task_id(client_id);
}
// return nullptr if should_stop() is true before receiving a result
// note: if one error is received, it will stop further processing and return error result
server_task_result_ptr next(const std::function<bool()> & should_stop) {
while (true) {
static const int http_polling_seconds = 1; // check should_stop every 1 second
server_task_result_ptr result = queue_results.recv_with_timeout({client_id}, http_polling_seconds);
if (result == nullptr) {
// timeout, check stop condition
if (should_stop()) {
return nullptr;
}
// continue waiting otherwise
} else {
SRV_DBG("recv result for client_id=%d: %s\n", client_id, safe_json_to_str(result->to_json()).c_str());
return result;
}
}
// should not reach here
}
};
static void res_ok(std::unique_ptr<server_http_res> & res, const json & response_data) {
res->status = 200;
res->data = safe_json_to_str(response_data);
@@ -1274,7 +1554,9 @@ void server_models_routes::init_routes() {
{"created", t}, // for OAI-compat
{"status", status},
{"architecture", architecture},
{"need_download", meta.need_download},
{"source", server_model_source_to_string(meta.source)},
{"can_remove", meta.source == SERVER_MODEL_SOURCE_CACHE},
// {"need_download", meta.need_download},
// TODO: add other fields, may require reading GGUF metadata
};
@@ -1312,6 +1594,87 @@ void server_models_routes::init_routes() {
res_ok(res, {{"success", true}});
return res;
};
this->get_router_models_sse = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
res->status = 200;
res->content_type = "text/event-stream";
auto sse_client = std::make_shared<server_models_sse_client>(models.sse);
res->next = [this, sse_client, &req](std::string & output) -> bool {
auto result = sse_client->next([&]() {
return stopping.load(std::memory_order_relaxed) || req.should_stop();
});
if (result == nullptr) {
return false; // client disconnected or should_stop
}
output = "data: " + safe_json_to_str(result->to_json()) + "\n\n";
return true; // listen for the next event
};
return res;
};
this->post_router_models = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
if (name.empty()) {
throw std::invalid_argument("model must be a non-empty string");
}
common_params_model model;
common_download_opts opts;
model.name = name;
model.hf_repo = name;
opts.bearer_token = params.hf_token;
opts.download_mmproj = true;
opts.download_mtp = true;
// first, only check if the model is valid and can be downloaded
opts.skip_download = true;
bool ok = false;
try {
auto validation = common_download_model(model, opts);
ok = !validation.model_path.empty();
} catch (const common_skip_download_exception &) {
// model is valid and will be downloaded
ok = true;
} catch (...) {
SRV_ERR("unknown error while validating model '%s'\n", name.c_str());
// other exceptions will be handled by the outer ex_wrapper()
throw;
}
if (!ok) {
throw std::invalid_argument("model validation failed, unable to download");
}
// then, proceed with the actual download
opts.skip_download = false;
SRV_INF("starting download for model '%s'\n", name.c_str());
models.download(std::move(model), std::move(opts));
res_ok(res, {{"success", true}});
return res;
};
this->del_router_models = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
std::string name = req.get_param("model");
if (name.empty()) {
throw std::invalid_argument("model must be a non-empty string");
}
bool ok = models.remove(name);
if (!ok) {
throw std::runtime_error("failed to remove model '" + name + "'");
}
res_ok(res, {{"success", true}});
return res;
};
}
+62 -27
View File
@@ -1,9 +1,11 @@
#pragma once
#include "common.h"
#include "download.h"
#include "preset.h"
#include "server-common.h"
#include "server-http.h"
#include "server-queue.h"
#include <mutex>
#include <condition_variable>
@@ -14,6 +16,8 @@
/**
* state diagram:
*
* DOWNLOADING DOWNLOADED (replaced by new instance)
*
* UNLOADED LOADING LOADED SLEEPING
*
* failed
@@ -22,39 +26,43 @@
*/
enum server_model_status {
// TODO: also add downloading state when the logic is added
SERVER_MODEL_STATUS_DOWNLOADING,
SERVER_MODEL_STATUS_DOWNLOADED,
SERVER_MODEL_STATUS_UNLOADED,
SERVER_MODEL_STATUS_LOADING,
SERVER_MODEL_STATUS_LOADED,
SERVER_MODEL_STATUS_SLEEPING
};
static server_model_status server_model_status_from_string(const std::string & status_str) {
if (status_str == "unloaded") {
return SERVER_MODEL_STATUS_UNLOADED;
}
if (status_str == "loading") {
return SERVER_MODEL_STATUS_LOADING;
}
if (status_str == "loaded") {
return SERVER_MODEL_STATUS_LOADED;
}
if (status_str == "sleeping") {
return SERVER_MODEL_STATUS_SLEEPING;
}
throw std::runtime_error("invalid server model status");
}
enum server_model_source {
SERVER_MODEL_SOURCE_PRESET,
SERVER_MODEL_SOURCE_MODELS_DIR,
SERVER_MODEL_SOURCE_CACHE,
};
static std::string server_model_status_to_string(server_model_status status) {
switch (status) {
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
default: return "unknown";
case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading";
case SERVER_MODEL_STATUS_DOWNLOADED: return "downloaded";
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
default: return "unknown";
}
}
static std::string server_model_source_to_string(server_model_source source) {
switch (source) {
case SERVER_MODEL_SOURCE_PRESET: return "preset";
case SERVER_MODEL_SOURCE_MODELS_DIR: return "models_dir";
case SERVER_MODEL_SOURCE_CACHE: return "cache";
default: return "unknown";
}
}
struct server_model_meta {
server_model_source source = SERVER_MODEL_SOURCE_CACHE;
common_preset preset;
std::string name;
std::set<std::string> aliases; // additional names that resolve to this model
@@ -63,11 +71,11 @@ struct server_model_meta {
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
json loaded_info; // info to be reflected via /v1/models endpoint
json loaded_info; // info to be reflected via /v1/models endpoint ; if in DOWNLOADING state, it should contain download progress info
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
mtmd_caps multimodal; // multimodal capabilities
bool need_download = false; // whether the model needs to be downloaded before loading
// bool need_download = false; // whether the model needs to be downloaded before loading // TODO @ngxson: implement this
bool is_ready() const {
return status == SERVER_MODEL_STATUS_LOADED;
@@ -85,12 +93,15 @@ struct server_model_meta {
void update_caps();
};
struct subprocess_s;
struct server_models_routes;
struct server_subproc; // defined in server-models.cpp
struct server_models {
friend struct server_models_routes;
private:
struct instance_t {
std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
std::shared_ptr<server_subproc> subproc; // shared between main thread and monitoring thread
std::thread th;
server_model_meta meta;
FILE * stdin_file = nullptr;
@@ -107,6 +118,9 @@ private:
// set to true while load_models() is executing a reload; load() will wait until clear
bool is_reloading = false;
// if true, the next get_meta() will trigger a reload of model list
bool need_reload = false;
common_preset_context ctx_preset;
common_params base_params;
@@ -122,9 +136,14 @@ private:
// not thread-safe, caller must hold mutex
void add_model(server_model_meta && meta);
// notify SSE clients
void notify_sse(const std::string & event, const std::string & model_id, const json & data = nullptr);
public:
server_models(const common_params & params, int argc, char ** argv);
server_response sse; // for real-time updates via SSE endpoint
// (re-)load the list of models from various sources and prepare the metadata mapping
// - if this is called the first time, simply populate the metadata
// - if this is called subsequently (e.g. when refreshing from disk):
@@ -147,13 +166,24 @@ public:
void unload(const std::string & name);
void unload_all();
// download a new model, progress is reported via SSE
// to stop the download, call unload()
void download(common_params_model && model, common_download_opts && opts);
// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
void update_loaded_info(const std::string & name, std::string & raw_info);
void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);
// remove a cache model from disk and update the list (thread-safe)
// note: only cache models can be removed; returns false if the model doesn't exist or is not a cache model
bool remove(const std::string & name);
// wait until the model instance is fully loaded (thread-safe)
// note: predicate is called while holding the lock
// return when the model no longer in "loading" state
void wait_until_loading_finished(const std::string & name);
void wait(const std::string & name, std::function<bool(const server_model_meta &)> predicate);
void wait(std::unique_lock<std::mutex> & lk, const std::string & name, std::function<bool(const server_model_meta &)> predicate);
// ensure the model is in ready state (thread-safe)
// return false if model is ready
@@ -176,8 +206,9 @@ public:
struct server_models_routes {
common_params params;
json ui_settings = json::object(); // Primary: new name
json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat)
json ui_settings = json::object(); // Primary: new name
json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat)
std::atomic<bool> stopping = false; // for graceful disconnecting SSE clients during shutdown
server_models models;
server_models_routes(const common_params & params, int argc, char ** argv)
: params(params), models(params, argc, argv) {
@@ -206,6 +237,10 @@ struct server_models_routes {
server_http_context::handler_t get_router_models;
server_http_context::handler_t post_router_models_load;
server_http_context::handler_t post_router_models_unload;
// management API
server_http_context::handler_t get_router_models_sse;
server_http_context::handler_t post_router_models;
server_http_context::handler_t del_router_models;
};
/**
+11
View File
@@ -331,6 +331,17 @@ void server_response::send(server_task_result_ptr && result) {
}
}
void server_response::broadcast(server_task_result_ptr && result) {
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : waiting_task_ids) {
RES_DBG("task id = %d pushed to result queue\n", id_task);
server_task_result_ptr res_copy(result->clone());
res_copy->id = id_task; // override id with target task id
queue_results.emplace_back(std::move(res_copy));
}
condition_results.notify_all();
}
void server_response::terminate() {
running = false;
condition_results.notify_all();
+5 -1
View File
@@ -154,11 +154,15 @@ public:
// Send a new result to a waiting id_task
void send(server_task_result_ptr && result);
// broadcast a new result to all waiting tasks
// (used by router mode)
void broadcast(server_task_result_ptr && result);
// terminate the waiting loop
void terminate();
};
// utility class to make working with server_queue and server_response easier
// RAII wrapper to make working with server_queue and server_response easier
// it provides a generator-like API for server responses
// support pooling connection state and aggregating multiple results
struct server_response_reader {
+12
View File
@@ -312,6 +312,9 @@ struct server_task_result {
}
virtual json to_json() = 0;
virtual ~server_task_result() = default;
virtual server_task_result * clone() const {
GGML_ABORT("not implemented for this task type");
}
};
// using shared_ptr for polymorphism of server_task_result
@@ -649,3 +652,12 @@ struct server_prompt_cache {
void update();
};
// used exclusively by router mode
struct server_task_result_router : server_task_result {
json data;
virtual json to_json() override { return data; }
virtual server_task_result * clone() const override {
return new server_task_result_router(*this);
}
};
+8
View File
@@ -174,8 +174,11 @@ int llama_server(int argc, char ** argv) {
routes.get_props = models_routes->get_router_props;
routes.get_models = models_routes->get_router_models;
ctx_http.post("/models", ex_wrapper(models_routes->post_router_models));
ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
ctx_http.get ("/models/sse", ex_wrapper(models_routes->get_router_models_sse));
ctx_http.del ("/models", ex_wrapper(models_routes->del_router_models));
}
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
@@ -261,6 +264,7 @@ int llama_server(int argc, char ** argv) {
clean_up = [&models_routes]() {
SRV_INF("%s: cleaning up before exit...\n", __func__);
if (models_routes.has_value()) {
models_routes->stopping.store(true); // maybe redundant, but just to be safe
models_routes->models.unload_all();
}
llama_backend_free();
@@ -274,6 +278,10 @@ int llama_server(int argc, char ** argv) {
ctx_http.is_ready.store(true);
shutdown_handler = [&](int) {
if (models_routes.has_value()) {
// important to disconnect any SSE clients
models_routes->stopping.store(true);
}
ctx_http.stop();
};
+96
View File
@@ -1,3 +1,4 @@
import threading
import pytest
from utils import *
@@ -253,3 +254,98 @@ def test_router_reload_models():
assert "model-reload-c" in ids, "newly added model should appear"
finally:
os.remove(preset_path)
MODEL_DOWNLOAD_ID = "ggml-org/test-model-router-download:F16"
MODEL_DOWNLOAD_TIMEOUT = 300
def _listen_sse(server: ServerProcess, collected: list, stop: threading.Event):
"""Collect /models/sse events into `collected` until `stop` is set."""
url = f"http://{server.server_host}:{server.server_port}/models/sse"
try:
with requests.get(url, stream=True, timeout=MODEL_DOWNLOAD_TIMEOUT) as resp:
for line_bytes in resp.iter_lines():
if stop.is_set():
break
line = line_bytes.decode("utf-8")
if line.startswith("data: "):
collected.append(json.loads(line[6:]))
except Exception:
pass
def _wait_for_sse_event(collected: list, event_type: str, model: str, timeout: int) -> bool:
deadline = time.time() + timeout
while time.time() < deadline:
if any(e.get("event") == event_type and e.get("model") == model for e in collected):
return True
time.sleep(0.5)
return False
def test_router_download_model():
"""Case 1: download a model, verify SSE events and GET /models."""
global server
server.start()
# Ensure the model is not present before we start
server.make_request("DELETE", f"/models?model={MODEL_DOWNLOAD_ID}")
sse_events: list = []
stop = threading.Event()
sse_thread = threading.Thread(
target=_listen_sse, args=(server, sse_events, stop), daemon=True
)
sse_thread.start()
# Trigger the download
res = server.make_request("POST", "/models", data={"model": MODEL_DOWNLOAD_ID})
assert res.status_code == 200
assert res.body.get("success") is True
# Wait for download_finished SSE event
finished = _wait_for_sse_event(
sse_events, "download_finished", MODEL_DOWNLOAD_ID, MODEL_DOWNLOAD_TIMEOUT
)
stop.set()
assert finished, "Never received download_finished SSE event"
assert any(
e.get("event") == "download_progress" and e.get("model") == MODEL_DOWNLOAD_ID
for e in sse_events
), "No download_progress events received"
# Model should now appear in GET /models
ids = _get_model_ids(is_reload=False)
assert MODEL_DOWNLOAD_ID in ids, f"{MODEL_DOWNLOAD_ID} not found in /models after download"
def test_router_delete_model():
"""Case 2: delete the downloaded model, verify it disappears from GET /models."""
global server
server.start()
# Ensure the model exists (download it if needed)
if MODEL_DOWNLOAD_ID not in _get_model_ids(is_reload=False):
res = server.make_request("POST", "/models", data={"model": MODEL_DOWNLOAD_ID})
assert res.status_code == 200
sse_events: list = []
stop = threading.Event()
threading.Thread(
target=_listen_sse, args=(server, sse_events, stop), daemon=True
).start()
finished = _wait_for_sse_event(
sse_events, "download_finished", MODEL_DOWNLOAD_ID, MODEL_DOWNLOAD_TIMEOUT
)
stop.set()
assert finished, "Model did not finish downloading before delete test"
# Delete the model
del_res = server.make_request("DELETE", f"/models?model={MODEL_DOWNLOAD_ID}")
assert del_res.status_code == 200
assert del_res.body.get("success") is True
# Model should no longer appear in GET /models
ids = _get_model_ids(is_reload=False)
assert MODEL_DOWNLOAD_ID not in ids, f"{MODEL_DOWNLOAD_ID} still present after deletion"
+3
View File
@@ -340,6 +340,9 @@ class ServerProcess:
elif method == "POST":
response = requests.post(url, headers=headers, json=data, timeout=timeout)
parse_body = True
elif method == "DELETE":
response = requests.delete(url, headers=headers, timeout=timeout)
parse_body = True
elif method == "OPTIONS":
response = requests.options(url, headers=headers, timeout=timeout)
else:
+1 -1
View File
@@ -27,7 +27,7 @@ echo "Running pre-commit checks for llama-ui..."
# Format only staged files
staged_ui=$(git diff --cached --name-only -- tools/ui/)
if [ -n "$staged_ui" ]; then
echo "$staged_ui" | xargs npx --no-install prettier --write
echo "$staged_ui" | xargs npm run format
format_ok=$?
# Re-stage formatted files
git add tools/ui/
+1
View File
@@ -57,6 +57,7 @@ if [ $lint_ok -ne 0 ]; then
echo "❌ Lint failed"
exit 1
fi
if [ $test_ok -ne 0 ]; then
echo "❌ Tests failed"
exit 1