mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-18 11:47:40 +02:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9724f664e8 | |||
| dd69db2924 | |||
| 6ec59ddaea | |||
| 32e806b9c1 | |||
| 6f1034b32a | |||
| 0b73fc79fe | |||
| 4a79037b8b | |||
| cae0a3b0b0 | |||
| f3e1828164 | |||
| 2e88c49c90 | |||
| 0843245cb1 |
@@ -46,11 +46,13 @@ jobs:
|
||||
|
||||
steps:
|
||||
- id: check
|
||||
env:
|
||||
COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
|
||||
if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
|
||||
if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
@@ -542,6 +544,7 @@ jobs:
|
||||
steps:
|
||||
- name: Set OpenVINO version output
|
||||
id: openvino_version
|
||||
shell: bash
|
||||
run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Clone
|
||||
|
||||
+26
-13
@@ -20,16 +20,21 @@ int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
// hands the update over to the install script, which downloads and swaps the binary
|
||||
// Self-update is only supported for binaries built with llama-install.sh
|
||||
// Updates llama by handing control to the official install script.
//
// Self-update is compiled in only when LLAMA_INSTALL_BUILD is defined; any
// other build prints a hint and reports failure.
//
// argc / argv exist only to match the common command signature and are ignored.
//
// Returns 0 when the install command reported success, 1 otherwise.
static int llama_update(int argc, char ** argv) {
    (void) argc;
    (void) argv;

#ifdef LLAMA_INSTALL_BUILD
#if defined(_WIN32)
    const int status = system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
#else
    // NOTE(review): the status of a shell pipeline is that of its last stage,
    // so a failed download may still be reported as success by `sh` — confirm.
    const int status = system("curl -fsSL https://llama.app/install.sh | sh");
#endif
    // system() returns an implementation-defined status (a wait status on
    // POSIX, e.g. 256 for "exit 1"). If that raw value were used as the
    // process exit code it would be truncated to 8 bits and could read as 0,
    // so collapse it to a plain success/failure code instead.
    return status == 0 ? 0 : 1;
#else
    printf("Updates are available only when installed from https://llama.app\n");
    return 1;
#endif
}
|
||||
|
||||
static const char * progname;
|
||||
@@ -46,21 +51,29 @@ struct command {
|
||||
int (*func)(int, char **);
|
||||
};
|
||||
|
||||
#ifdef LLAMA_INSTALL_BUILD
|
||||
#define UPDATE_HIDDEN false
|
||||
#else
|
||||
#define UPDATE_HIDDEN true
|
||||
#endif
|
||||
|
||||
static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"update", "Update llama to the latest release", {}, false, llama_update },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, false, version },
|
||||
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
|
||||
{"help", "Show available commands", {}, false, help },
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"update", "Update llama to the latest release", {}, UPDATE_HIDDEN, llama_update },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, false, version },
|
||||
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
|
||||
{"help", "Show available commands", {}, false, help },
|
||||
};
|
||||
|
||||
#undef UPDATE_HIDDEN
|
||||
|
||||
static int version(int argc, char ** argv) {
|
||||
printf("%s\n", llama_build_info());
|
||||
return 0;
|
||||
|
||||
+60
-2
@@ -161,6 +161,64 @@ You could update your test result in it directly.
|
||||
|
||||
Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.
|
||||
|
||||
## Quick Development WOW
|
||||
|
||||
This chapter is a quick-start guide for developing with and trying out the SYCL backend on Intel GPUs.
|
||||
|
||||
You need to install the following software before development:
|
||||
- Intel GPU driver
|
||||
- oneAPI package
|
||||
- other development tools.
|
||||
|
||||
Please refer to the [Linux](#linux) or [Windows](#windows-1) section for detailed installation instructions and troubleshooting.
|
||||
|
||||
- Linux
|
||||
|
||||
```
|
||||
## build from source code
|
||||
./examples/sycl/build.sh
|
||||
|
||||
## run CONV_2D_DW unit test cases
|
||||
./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
|
||||
|
||||
## run all unit test cases
|
||||
./build/bin/test-backend-ops -b SYCL0
|
||||
|
||||
## run with LLM on the first GPU
|
||||
./examples/sycl/test.sh -mg 0 -m xxxx.gguf
|
||||
|
||||
## run service with LLM on the first GPU
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
./examples/sycl/start-svr.sh -m xxxx.gguf
|
||||
|
||||
## update the docs/ops.md for new/update OPs
|
||||
./examples/sycl/update-ops-doc.sh
|
||||
```
|
||||
|
||||
- Windows
|
||||
|
||||
```
|
||||
## build from source code
|
||||
examples\sycl\win-build-sycl.bat
|
||||
|
||||
## run CONV_2D_DW unit test cases
|
||||
build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
|
||||
|
||||
## run all unit test cases
|
||||
build\bin\test-backend-ops.exe -b SYCL0
|
||||
|
||||
## run LLM on the first GPU
|
||||
examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
|
||||
|
||||
## run service with LLM on the first GPU
|
||||
set ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
examples\sycl\win-start-svr.bat -m xxxx.gguf
|
||||
|
||||
## update the docs/ops.md for new/update OPs
|
||||
examples\sycl\win-update-ops-doc.bat
|
||||
```
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### I. Setup Environment
|
||||
@@ -701,7 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
|
||||
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
|
||||
| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
|
||||
| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Build with support for using the Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. The SYCL backend still runs on the Level Zero runtime when this is OFF; in that case the SYCL API is used for memory allocation. |
|
||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||
|
||||
@@ -716,7 +774,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
|
||||
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still under development and does not yet improve performance. |
|
||||
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
|
||||
| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. The SYCL backend still runs on the Level Zero runtime when this is set to 0; in that case the SYCL API is used for memory allocation. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
|
||||
+3
-3
@@ -27,11 +27,11 @@ Legend:
|
||||
| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
|
||||
+1582
-1582
File diff suppressed because it is too large
Load Diff
Executable
+9
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash

# MIT license
# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: MIT

# Regenerates docs/ops/SYCL.csv from test-backend-ops, then rebuilds the ops docs.
# All paths are relative: run from the directory that contains build/, docs/ and scripts/.

# Stop at the first failing command so docs are never rebuilt from a
# truncated or partial CSV left behind by a failed test-backend-ops run.
set -euo pipefail

./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
./scripts/create_ops_docs.py
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
@echo off

rem MIT license
rem Copyright (C) 2026 Intel Corporation
rem SPDX-License-Identifier: MIT

rem Regenerates docs\ops\SYCL.csv from test-backend-ops, then rebuilds the ops docs.
rem All paths are relative: run from the directory that contains build\, docs\ and scripts\.

build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
rem Do not rebuild the docs from a CSV produced by a failed run.
if errorlevel 1 exit /b %errorlevel%

python scripts\create_ops_docs.py
|
||||
+1
-1
@@ -249,7 +249,7 @@ option(GGML_SYCL "ggml: use SYCL"
|
||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
|
||||
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
|
||||
option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
|
||||
option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API "ggml: use Level Zero API in SYCL backend" ON)
|
||||
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
|
||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||
"ggml: sycl target device")
|
||||
|
||||
@@ -438,7 +438,14 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
|
||||
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
|
||||
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
|
||||
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
|
||||
# POWER11 backend: only if compiler supports -mcpu=power11
|
||||
check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
|
||||
if (GGML_CXX_SUPPORTS_POWER11)
|
||||
message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
|
||||
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
|
||||
else()
|
||||
message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
|
||||
@@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
|
||||
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
||||
|
||||
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
||||
if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (EXTRACTED_NUMBER EQUAL 9)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9)
|
||||
|
||||
@@ -1703,7 +1703,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm(ggml_metal_
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_ROPE);
|
||||
assert(op->op == GGML_OP_ROPE || op->op == GGML_OP_ROPE_BACK);
|
||||
|
||||
const bool is_back = op->op == GGML_OP_ROPE_BACK;
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
@@ -1727,13 +1729,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
|
||||
snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
|
||||
}
|
||||
|
||||
snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
|
||||
snprintf(name, 256, "%s_imrope=%d_is_back=%d", base, is_imrope ? 1 : 0, is_back ? 1 : 0);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
|
||||
ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
|
||||
ggml_metal_cv_set_bool(cv, is_back, FC_ROPE + 1);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
|
||||
@@ -1184,6 +1184,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_OP_RMS_NORM:
|
||||
return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
return true;
|
||||
case GGML_OP_IM2COL:
|
||||
return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -375,6 +375,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
n_fuse = ggml_metal_op_norm(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
{
|
||||
n_fuse = ggml_metal_op_rope(ctx, idx);
|
||||
} break;
|
||||
|
||||
@@ -4358,6 +4358,7 @@ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_
|
||||
#endif
|
||||
|
||||
constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];
|
||||
constant bool FC_rope_is_back [[function_constant(FC_ROPE + 1)]];
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
@@ -4381,6 +4382,9 @@ static void rope_yarn(
|
||||
}
|
||||
*cos_theta = cos(theta) * mscale;
|
||||
*sin_theta = sin(theta) * mscale;
|
||||
if (FC_rope_is_back) {
|
||||
*sin_theta *= -1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
@@ -7553,7 +7557,9 @@ typedef decltype(kernel_concat<float>) kernel_concat_t;
|
||||
|
||||
template [[host_name("kernel_concat_f32")]] kernel kernel_concat_t kernel_concat<float>;
|
||||
template [[host_name("kernel_concat_f16")]] kernel kernel_concat_t kernel_concat<half>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat<bfloat>;
|
||||
#endif
|
||||
template [[host_name("kernel_concat_i8")]] kernel kernel_concat_t kernel_concat<char>;
|
||||
template [[host_name("kernel_concat_i16")]] kernel kernel_concat_t kernel_concat<short>;
|
||||
template [[host_name("kernel_concat_i32")]] kernel kernel_concat_t kernel_concat<int>;
|
||||
|
||||
@@ -39,8 +39,8 @@ if (WIN32)
|
||||
set(CMAKE_CXX_COMPILER "icx")
|
||||
set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
|
||||
endif()
|
||||
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
|
||||
if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO_API is enabled)
|
||||
if(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
|
||||
if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
|
||||
set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH})
|
||||
if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
|
||||
@@ -105,8 +105,8 @@ endif()
|
||||
|
||||
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
|
||||
|
||||
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
|
||||
if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO_API ${GGML_SYCL_SUPPORT_LEVEL_ZERO_API}")
|
||||
if (GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
|
||||
# Link against Level Zero loader for direct device memory allocation.
|
||||
# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
|
||||
# in the xe kernel driver during multi-GPU inference.
|
||||
@@ -114,7 +114,7 @@ if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
|
||||
if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
|
||||
target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
|
||||
message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
|
||||
message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
|
||||
else()
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
#include "common.hpp"
|
||||
#include <sycl/backend.hpp>
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
#include <level_zero/ze_api.h>
|
||||
#endif
|
||||
|
||||
@@ -84,9 +84,9 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
|
||||
return sycl_down_blk_size;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
||||
return g_ggml_sycl_enable_level_zero &&
|
||||
return g_ggml_sycl_use_level_zero_api &&
|
||||
q.get_device().is_gpu() &&
|
||||
q.get_backend() == sycl::backend::ext_oneapi_level_zero;
|
||||
}
|
||||
@@ -95,7 +95,7 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
||||
// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
|
||||
// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
|
||||
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
if (ggml_sycl_use_level_zero_device_alloc(q)) {
|
||||
void *ptr = nullptr;
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
|
||||
@@ -127,7 +127,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
|
||||
|
||||
void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
|
||||
if (!ptr) return;
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
if (ggml_sycl_use_level_zero_device_alloc(q)) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
|
||||
zeMemFree(ze_ctx, ptr);
|
||||
|
||||
@@ -324,7 +324,7 @@ struct ggml_tensor_extra_gpu {
|
||||
optimize_feature optimized_feature;
|
||||
};
|
||||
|
||||
extern int g_ggml_sycl_enable_level_zero;
|
||||
extern int g_ggml_sycl_use_level_zero_api;
|
||||
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
|
||||
void ggml_sycl_free_device(void *ptr, sycl::queue &q);
|
||||
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
#include "conv2d-dw.hpp"
|
||||
|
||||
struct conv2d_dw_params {
|
||||
int in_w, in_h;
|
||||
int out_w, out_h;
|
||||
int kernel_w, kernel_h;
|
||||
int stride_x, stride_y;
|
||||
int padding_x, padding_y;
|
||||
int dilation_x, dilation_y;
|
||||
int channels, batches;
|
||||
};
|
||||
|
||||
struct conv2d_dw_kernel_bounds {
|
||||
int y_min, y_max;
|
||||
int x_min, x_max;
|
||||
};
|
||||
|
||||
// Computes the range of kernel taps to visit for output element (out_x, out_y).
// Lower bounds are clamped to 0 and upper bounds to the kernel extent; the
// "+ dilation - 1" term makes the integer division round up for non-negative
// numerators.
static inline conv2d_dw_kernel_bounds dw_calculate_kernel_bounds(int out_x, int out_y,
                                                                  const conv2d_dw_params & p) {
    // Offset of this output element's window along each axis, before padding.
    const int win_y = out_y * p.stride_y;
    const int win_x = out_x * p.stride_x;

    const int ky_lo = (p.padding_y - win_y + p.dilation_y - 1) / p.dilation_y;
    const int ky_hi = (p.in_h + p.padding_y - win_y + p.dilation_y - 1) / p.dilation_y;
    const int kx_lo = (p.padding_x - win_x + p.dilation_x - 1) / p.dilation_x;
    const int kx_hi = (p.in_w + p.padding_x - win_x + p.dilation_x - 1) / p.dilation_x;

    conv2d_dw_kernel_bounds result;
    result.y_min = sycl::max(0, ky_lo);
    result.y_max = sycl::min(p.kernel_h, ky_hi);
    result.x_min = sycl::max(0, kx_lo);
    result.x_max = sycl::min(p.kernel_w, kx_hi);
    return result;
}
|
||||
|
||||
// Maps an output coordinate and a kernel tap to the input coordinate along one
// axis: out_coord * stride + kern_coord * dilation - padding.
static inline int dw_calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
    const int window_start = out_coord * stride;
    const int tap_offset   = kern_coord * dilation;
    return window_start + tap_offset - padding;
}
|
||||
|
||||
// whcn layout: input/output stored as [N, C, H, W]
|
||||
struct dw_whcn_layout {
|
||||
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.in_w * p.in_h) + c * p.in_w * p.in_h + y * p.in_w + x;
|
||||
}
|
||||
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
|
||||
return c * p.kernel_h * p.kernel_w + ky * p.kernel_w + kx;
|
||||
}
|
||||
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.out_w * p.out_h) + c * p.out_w * p.out_h + y * p.out_w + x;
|
||||
}
|
||||
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
|
||||
int & n, int & c, int & out_y, int & out_x) {
|
||||
out_x = global_idx % p.out_w;
|
||||
out_y = (global_idx / p.out_w) % p.out_h;
|
||||
c = (global_idx / (p.out_w * p.out_h)) % p.channels;
|
||||
n = global_idx / (p.out_w * p.out_h * p.channels);
|
||||
}
|
||||
};
|
||||
|
||||
// cwhn layout: input/output stored as [N, H, W, C]
|
||||
struct dw_cwhn_layout {
|
||||
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.in_w * p.in_h) + (y * p.in_w + x) * p.channels + c;
|
||||
}
|
||||
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
|
||||
return (ky * p.kernel_w + kx) * p.channels + c;
|
||||
}
|
||||
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.out_w * p.out_h) + y * (p.out_w * p.channels) + x * p.channels + c;
|
||||
}
|
||||
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
|
||||
int & n, int & c, int & out_y, int & out_x) {
|
||||
c = global_idx % p.channels;
|
||||
out_x = (global_idx / p.channels) % p.out_w;
|
||||
out_y = (global_idx / (p.channels * p.out_w)) % p.out_h;
|
||||
n = global_idx / (p.channels * p.out_w * p.out_h);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Layout>
|
||||
static void conv2d_dw_kernel(const float * input, const float * kernel, float * output,
|
||||
const conv2d_dw_params p, const sycl::nd_item<3> & item_ct1) {
|
||||
const int global_idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int total_elements = p.batches * p.channels * p.out_h * p.out_w;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
int n, c, out_y, out_x;
|
||||
Layout::unpack_indices(global_idx, p, n, c, out_y, out_x);
|
||||
|
||||
float acc = 0.0f;
|
||||
const conv2d_dw_kernel_bounds bounds = dw_calculate_kernel_bounds(out_x, out_y, p);
|
||||
|
||||
for (int ky = bounds.y_min; ky < bounds.y_max; ++ky) {
|
||||
const int in_y = dw_calculate_input_coord(out_y, ky, p.stride_y, p.dilation_y, p.padding_y);
|
||||
for (int kx = bounds.x_min; kx < bounds.x_max; ++kx) {
|
||||
const int in_x = dw_calculate_input_coord(out_x, kx, p.stride_x, p.dilation_x, p.padding_x);
|
||||
acc += input[Layout::input_index(n, c, in_y, in_x, p)] *
|
||||
kernel[Layout::kernel_index(c, ky, kx, p)];
|
||||
}
|
||||
}
|
||||
|
||||
output[Layout::output_index(n, c, out_y, out_x, p)] = acc;
|
||||
}
|
||||
|
||||
template <typename Layout>
|
||||
static void conv2d_dw_sycl(const float * x_d, const float * w_d, float * y_d,
|
||||
const conv2d_dw_params p, const queue_ptr & stream) {
|
||||
const int total = p.batches * p.channels * p.out_h * p.out_w;
|
||||
const int num_blocks = (total + SYCL_CONV2D_DW_BLOCK_SIZE - 1) / SYCL_CONV2D_DW_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_DW_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv2d_dw_kernel<Layout>(x_d, w_d, y_d, p, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
// Host entry point for the depthwise 2D convolution on the SYCL backend.
//
// dst->src[0] is the kernel and dst->src[1] is the input; all three tensors
// must be F32. Stride, padding and dilation (x then y) are read from
// dst->op_params[0..5]. The index layout is selected from the input's memory
// layout; any other layout aborts.
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);

    const ggml_tensor * kernel = dst->src[0];
    const ggml_tensor * input  = dst->src[1];

    GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);

    const float * weights = (const float *) kernel->data;
    const float * src     = (const float *) input->data;
    float *       out     = (float *) dst->data;

    const int32_t * opts = (const int32_t *) dst->op_params;

    // Tensor extents are 64-bit in ggml; the kernel works with plain int.
    conv2d_dw_params params;
    params.in_w       = static_cast<int>(input->ne[0]);
    params.in_h       = static_cast<int>(input->ne[1]);
    params.out_w      = static_cast<int>(dst->ne[0]);
    params.out_h      = static_cast<int>(dst->ne[1]);
    params.kernel_w   = static_cast<int>(kernel->ne[0]);
    params.kernel_h   = static_cast<int>(kernel->ne[1]);
    params.stride_x   = opts[0];
    params.stride_y   = opts[1];
    params.padding_x  = opts[2];
    params.padding_y  = opts[3];
    params.dilation_x = opts[4];
    params.dilation_y = opts[5];
    params.channels   = static_cast<int>(dst->ne[2]);
    params.batches    = static_cast<int>(dst->ne[3]);

    const queue_ptr stream = ctx.stream();

    if (ggml_is_contiguous(input)) {
        conv2d_dw_sycl<dw_whcn_layout>(src, weights, out, params, stream);
        return;
    }
    if (ggml_is_contiguous_channels(input)) {
        conv2d_dw_sycl<dw_cwhn_layout>(src, weights, out, params, stream);
        return;
    }
    GGML_ABORT("Unsupported memory layout for conv2d_dw");
}
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_CONV2D_DW_HPP
|
||||
#define GGML_SYCL_CONV2D_DW_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_CONV2D_DW_BLOCK_SIZE 256
|
||||
|
||||
// Depthwise 2D convolution for the SYCL backend. Writes the result into dst;
// the kernel is taken from dst->src[0] and the input from dst->src[1].
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV2D_DW_HPP
|
||||
@@ -0,0 +1,125 @@
|
||||
#include "conv2d-transpose.hpp"
|
||||
#include "convert.hpp"
|
||||
|
||||
template <typename kernel_t>
|
||||
static void conv2d_transpose_kernel(const float * input, const kernel_t * kernel, float * output,
|
||||
const int in_w, const int in_h,
|
||||
const int out_w, const int out_h,
|
||||
const int kernel_w, const int kernel_h,
|
||||
const int stride,
|
||||
const int c_in, const int c_out, const int batches,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int global_idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int total_elements = out_w * out_h * c_out * batches;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int out_x = global_idx % out_w;
|
||||
const int out_y = (global_idx / out_w) % out_h;
|
||||
const int c_idx = (global_idx / (out_w * out_h)) % c_out;
|
||||
const int n_idx = global_idx / (out_w * out_h * c_out);
|
||||
|
||||
float acc = 0.0f;
|
||||
|
||||
for (int c_in_idx = 0; c_in_idx < c_in; ++c_in_idx) {
|
||||
for (int kh = 0; kh < kernel_h; ++kh) {
|
||||
int in_y = out_y - kh;
|
||||
if (in_y < 0 || in_y % stride) {
|
||||
continue;
|
||||
}
|
||||
in_y /= stride;
|
||||
if (in_y >= in_h) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int kw = 0; kw < kernel_w; ++kw) {
|
||||
int in_x = out_x - kw;
|
||||
if (in_x < 0 || in_x % stride) {
|
||||
continue;
|
||||
}
|
||||
in_x /= stride;
|
||||
if (in_x >= in_w) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + in_w * in_y + in_x;
|
||||
const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx +
|
||||
kernel_w * kh + kw;
|
||||
|
||||
acc += input[input_idx] * ggml_sycl_cast<float>(kernel[kernel_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + out_w * out_y + out_x] = acc;
|
||||
}
|
||||
|
||||
template <typename kernel_t>
|
||||
static void conv2d_transpose_sycl(const float * input_d, const kernel_t * kernel_d, float * output_d,
|
||||
const int in_w, const int in_h,
|
||||
const int out_w, const int out_h,
|
||||
const int kernel_w, const int kernel_h,
|
||||
const int stride,
|
||||
const int c_in, const int c_out, const int batches,
|
||||
const queue_ptr & stream) {
|
||||
const int total = out_w * out_h * c_out * batches;
|
||||
const int num_blocks = (total + SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv2d_transpose_kernel<kernel_t>(input_d, kernel_d, output_d,
|
||||
in_w, in_h, out_w, out_h, kernel_w, kernel_h,
|
||||
stride, c_in, c_out, batches, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
// input: (W, H, C_in, N)
|
||||
// kernel: (W, H, C_out, C_in)
|
||||
// output: (W, H, C_out, N)
|
||||
// SYCL entry point for GGML_OP_CONV_TRANSPOSE_2D.
// dst->src[0] is the kernel (F16 or F32), dst->src[1] the F32 input, dst the F32 output;
// the stride is read from dst->op_params[0]. All three tensors must be contiguous.
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);

    const ggml_tensor * kernel = dst->src[0];
    const ggml_tensor * input  = dst->src[1];

    // Supported element types.
    GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
    GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);

    // The device kernel addresses the buffers with flat, densely packed indices.
    GGML_ASSERT(ggml_is_contiguous(input));
    GGML_ASSERT(ggml_is_contiguous(kernel));
    GGML_ASSERT(ggml_is_contiguous(dst));

    const float * src_data = (const float *) input->data;
    float *       dst_data = (float *) dst->data;
    const void *  knl_data = kernel->data;

    // Geometry, narrowed to int as expected by the launcher.
    const int in_w   = input->ne[0];
    const int in_h   = input->ne[1];
    const int c_in   = input->ne[2];
    const int n_batch = input->ne[3];
    const int out_w  = dst->ne[0];
    const int out_h  = dst->ne[1];
    const int c_out  = kernel->ne[2];
    const int k_w    = kernel->ne[0];
    const int k_h    = kernel->ne[1];
    const int stride = dst->op_params[0];

    GGML_ASSERT(c_in == kernel->ne[3]);
    GGML_ASSERT(stride > 0);

    const queue_ptr stream = ctx.stream();

    // The type assert above leaves exactly two possibilities: F32 or F16.
    if (kernel->type != GGML_TYPE_F16) {
        conv2d_transpose_sycl<float>(src_data, (const float *) knl_data, dst_data,
                                     in_w, in_h, out_w, out_h, k_w, k_h,
                                     stride, c_in, c_out, n_batch, stream);
        return;
    }

    conv2d_transpose_sycl<sycl::half>(src_data, (const sycl::half *) knl_data, dst_data,
                                      in_w, in_h, out_w, out_h, k_w, k_h,
                                      stride, c_in, c_out, n_batch, stream);
}
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_CONV2D_TRANSPOSE_HPP
|
||||
#define GGML_SYCL_CONV2D_TRANSPOSE_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE 256
|
||||
|
||||
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV2D_TRANSPOSE_HPP
|
||||
@@ -0,0 +1,150 @@
|
||||
#include "conv2d.hpp"
|
||||
#include "convert.hpp"
|
||||
|
||||
// Immutable parameter pack passed by value to the conv2d device kernel.
// Field order matters: the struct is brace-initialized positionally.
struct conv2d_params {
    const int64_t IW;     // input width
    const int64_t IH;     // input height
    const int64_t OW;     // output width
    const int64_t OH;     // output height
    const int64_t KW;     // kernel width
    const int64_t KH;     // kernel height
    const int64_t ST_X;   // stride along x
    const int64_t ST_Y;   // stride along y
    const int64_t PD_X;   // padding along x
    const int64_t PD_Y;   // padding along y
    const int64_t DL_X;   // dilation along x
    const int64_t DL_Y;   // dilation along y
    const int64_t IC;     // input channels
    const int64_t OC;     // output channels
    const int64_t B;      // batch size
    const int64_t TOTAL;  // number of output elements (B * OC * OH * OW)
};
|
||||
|
||||
// Half-open ranges [min, max) of kernel taps, per axis, that the conv2d kernel iterates over.
struct conv2d_kernel_bounds {
    int64_t y_min;  // first kernel row to visit
    int64_t y_max;  // one past the last kernel row to visit
    int64_t x_min;  // first kernel column to visit
    int64_t x_max;  // one past the last kernel column to visit
};
|
||||
|
||||
// Returns the larger of two 64-bit integers (b when they are equal).
static inline int64_t conv2d_max64(int64_t a, int64_t b) {
    if (a > b) {
        return a;
    }
    return b;
}
|
||||
|
||||
// Returns the smaller of two 64-bit integers (b when they are equal).
static inline int64_t conv2d_min64(int64_t a, int64_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
|
||||
|
||||
// Computes, for one output pixel, the half-open range of kernel taps per axis.
// Each bound is a rounded-up division by the dilation, clamped to [0, KH) / [0, KW).
// With in = out * stride + k * dilation - padding, these appear to be the taps whose input
// coordinate lands inside [0, IH) / [0, IW), which would let the caller skip bounds checks.
static inline conv2d_kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv2d_params & P) {
    // Distance (in input pixels) from the first tap of this output pixel to the input origin.
    const int64_t off_y = P.PD_Y - out_y * P.ST_Y;
    const int64_t off_x = P.PD_X - out_x * P.ST_X;

    // (v + d - 1) / d rounds the division by the dilation up.
    const int64_t first_ky = (off_y + P.DL_Y - 1) / P.DL_Y;
    const int64_t end_ky   = (P.IH + off_y + P.DL_Y - 1) / P.DL_Y;
    const int64_t first_kx = (off_x + P.DL_X - 1) / P.DL_X;
    const int64_t end_kx   = (P.IW + off_x + P.DL_X - 1) / P.DL_X;

    conv2d_kernel_bounds bounds;
    bounds.y_min = conv2d_max64(0, first_ky);
    bounds.y_max = conv2d_min64(P.KH, end_ky);
    bounds.x_min = conv2d_max64(0, first_kx);
    bounds.x_max = conv2d_min64(P.KW, end_kx);
    return bounds;
}
|
||||
|
||||
// Maps an output coordinate and a kernel tap to the input coordinate it reads:
//   out_coord * stride + kern_coord * dilation - padding
// The result may be negative or past the input extent when the tap falls in the padding.
// Returns int64_t (not int) so the 64-bit result is not silently narrowed; callers already
// store it in int64_t.
static inline int64_t calculate_input_coord(int64_t out_coord, int64_t kern_coord, int64_t stride,
                                            int64_t dilation, int64_t padding) {
    return out_coord * stride + kern_coord * dilation - padding;
}
|
||||
|
||||
// whcn layout helpers (matching ggml tensor memory order)
|
||||
static inline int64_t whcn_input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
|
||||
return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x;
|
||||
}
|
||||
|
||||
static inline int64_t whcn_kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv2d_params & P) {
|
||||
return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx;
|
||||
}
|
||||
|
||||
static inline int64_t whcn_output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
|
||||
return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void conv2d_kernel(const float * input, const T * kernel, float * output,
|
||||
const conv2d_params P, const sycl::nd_item<3> & item_ct1) {
|
||||
const int64_t global_idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
|
||||
if (global_idx >= P.TOTAL) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t out_x = global_idx % P.OW;
|
||||
const int64_t out_y = (global_idx / P.OW) % P.OH;
|
||||
const int64_t c_out = (global_idx / (P.OW * P.OH)) % P.OC;
|
||||
const int64_t n = global_idx / (P.OW * P.OH * P.OC);
|
||||
|
||||
float acc = 0.0f;
|
||||
|
||||
const conv2d_kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
|
||||
|
||||
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
|
||||
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
|
||||
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
|
||||
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
|
||||
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
|
||||
const float input_val = input[whcn_input_index(n, c_in, in_y, in_x, P)];
|
||||
const T kernel_val = kernel[whcn_kernel_index(c_out, c_in, ky, kx, P)];
|
||||
acc += input_val * ggml_sycl_cast<float>(kernel_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[whcn_output_index(n, c_out, out_y, out_x, P)] = acc;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void conv2d_sycl(const float * X_D, const T * K_D, float * Y_D,
|
||||
const conv2d_params P, const queue_ptr & stream) {
|
||||
const int num_blocks = (P.TOTAL + SYCL_CONV2D_BLOCK_SIZE - 1) / SYCL_CONV2D_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv2d_kernel<T>(X_D, K_D, Y_D, P, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
// SYCL entry point for GGML_OP_CONV_2D (direct convolution, whcn layout only).
//   dst->src[0] kernel: ne = (KW, KH, IC, OC), F16 or F32, must be contiguous
//   dst->src[1] input : ne = (IW, IH, IC, B),  F32
//   dst         output: ne[0] = OW, ne[1] = OH, F32
// dst->op_params (int32): [0] stride x, [1] stride y, [2] padding x, [3] padding y,
//                         [4] dilation x, [5] dilation y, [6] layout flag (must be 0).
// NOTE(review): the index helpers address input and dst as densely packed buffers, but only
// the kernel tensor is asserted contiguous here - confirm callers never pass strided views.
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);

    const ggml_tensor * kernel = dst->src[0];
    const ggml_tensor * input  = dst->src[1];

    // The kernel may hold F16 or F32 data, so keep its pointer untyped until dispatch.
    const void *  K_D = kernel->data;
    const float * X_D = (const float *) input->data;
    float *       Y_D = (float *) dst->data;

    GGML_ASSERT(ggml_is_contiguous(kernel));
    GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
    GGML_ASSERT(input->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    // same number of input channels
    GGML_ASSERT(input->ne[2] == kernel->ne[2]);

    const queue_ptr stream = ctx.stream();

    const int32_t * p = (const int32_t *) dst->op_params;

    const int ST_X = p[0];
    const int ST_Y = p[1];
    const int PD_X = p[2];
    const int PD_Y = p[3];
    const int DL_X = p[4];
    const int DL_Y = p[5];

    // calculate_kernel_bounds() divides by the dilation on the device; reject non-positive
    // values here rather than dividing by zero inside the kernel.
    GGML_ASSERT(DL_X > 0 && DL_Y > 0);

    // no cwhn layout support
    GGML_ASSERT(p[6] == 0);

    const int IW = input->ne[0];
    const int IH = input->ne[1];
    const int OW = dst->ne[0];
    const int OH = dst->ne[1];
    const int KW = kernel->ne[0];
    const int KH = kernel->ne[1];
    const int IC = input->ne[2];
    const int OC = kernel->ne[3];
    const int B  = input->ne[3];

    // One work-item per output element; computed in 64 bits to avoid int overflow.
    const int64_t total = (int64_t) B * OC * OH * OW;

    // Positional initialization: the order must match the field order of conv2d_params.
    const conv2d_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };

    if (kernel->type == GGML_TYPE_F16) {
        conv2d_sycl<sycl::half>(X_D, (const sycl::half *) K_D, Y_D, params, stream);
    } else {
        conv2d_sycl<float>(X_D, (const float *) K_D, Y_D, params, stream);
    }
}
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_CONV2D_HPP
|
||||
#define GGML_SYCL_CONV2D_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_CONV2D_BLOCK_SIZE 256
|
||||
|
||||
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV2D_HPP
|
||||
@@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
@@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
@@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_nc_sycl<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_nc_sycl<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
||||
@@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
// Dequantizes two consecutive Q1_0 values from the reordered layout.
//   d_ptr: array of half-precision scales, indexed by block number ib
//   qs   : packed sign bits; bit i lives in byte i / 8 at bit position i % 8
//   iqs  : index of the first of the two bits to decode
// A set bit decodes to +d and a clear bit to -d; results go to v.x() and v.y().
static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
                                                    const int iqs, dfloat2 &v) {
    // Q1_0 reorder layout: scale values followed by quantized bits
    const sycl::half * scales = (const sycl::half *) d_ptr;
    const uint8_t *    packed = (const uint8_t *) qs;

    const dfloat d = (const dfloat) scales[ib];

    const int first  = iqs + 0;
    const int second = iqs + 1;

    const int b0 = (packed[first / 8] >> (first % 8)) & 1;
    const int b1 = (packed[second / 8] >> (second % 8)) & 1;

    // Map {0, 1} to {-1, +1} and apply the block scale.
    v.x() = (2 * b0 - 1) * d;
    v.y() = (2 * b1 - 1) * d;
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
@@ -1423,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_reorder<QK1_0, QR1_0, dequantize_q1_0_reorder>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK1_0, QR1_0, dequantize_q1_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
@@ -1759,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
sycl::half *src1_dfloat = nullptr; // dfloat == half
|
||||
|
||||
bool src1_convert_f16 =
|
||||
src0->type == GGML_TYPE_Q1_0 ||
|
||||
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
||||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
|
||||
@@ -1777,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
#endif // GGML_SYCL_F16
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
} else {
|
||||
dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/backend.hpp>
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
#include <level_zero/ze_api.h>
|
||||
#endif
|
||||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
@@ -62,6 +62,9 @@
|
||||
#include "ggml-sycl/repeat_back.hpp"
|
||||
#include "ggml-sycl/set_rows.hpp"
|
||||
#include "ggml-sycl/set.hpp"
|
||||
#include "ggml-sycl/conv2d.hpp"
|
||||
#include "ggml-sycl/conv2d-dw.hpp"
|
||||
#include "ggml-sycl/conv2d-transpose.hpp"
|
||||
#include "ggml-sycl/ssm_conv.hpp"
|
||||
#include "ggml-sycl/sycl_hw.hpp"
|
||||
#include "ggml-sycl/ssm_scan.hpp"
|
||||
@@ -84,7 +87,7 @@ int g_ggml_sycl_enable_vmm = 1;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
int g_ggml_sycl_enable_level_zero = 0;
|
||||
int g_ggml_sycl_use_level_zero_api = 0;
|
||||
int g_ggml_sycl_enable_flash_attention = 1;
|
||||
int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
|
||||
int g_ggml_sycl_usm_system = 0;
|
||||
@@ -154,7 +157,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
info.ext_oneapi_level_zero = false;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
|
||||
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
|
||||
ze_device_properties_t props = {};
|
||||
@@ -169,13 +172,13 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
info.default_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
// Large buffers can be allocated before ggml_check_sycl() initializes other
|
||||
// g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
|
||||
g_ggml_sycl_enable_level_zero =
|
||||
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
|
||||
g_ggml_sycl_use_level_zero_api =
|
||||
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_USE_LEVEL_ZERO_API", 1);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
g_ggml_sycl_use_level_zero_api = 0;
|
||||
#endif
|
||||
|
||||
return info;
|
||||
@@ -274,7 +277,7 @@ static void ggml_check_sycl() try {
|
||||
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
|
||||
g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
|
||||
if (g_ggml_sycl_enable_level_zero == 0) {
|
||||
if (g_ggml_sycl_use_level_zero_api == 0) {
|
||||
g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
|
||||
}
|
||||
|
||||
@@ -309,10 +312,10 @@ static void ggml_check_sycl() try {
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
|
||||
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: yes\n");
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: no\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
|
||||
@@ -328,12 +331,12 @@ static void ggml_check_sycl() try {
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
|
||||
#endif
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: %d\n", g_ggml_sycl_use_level_zero_api);
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: Disable Level Zero API usage by compile flag\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO_API\n",
|
||||
g_ggml_sycl_dev2dev_memcpy);
|
||||
#endif
|
||||
#if GGML_SYCL_DNNL
|
||||
@@ -599,7 +602,7 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
static bool ggml_sycl_is_l0_discrete_gpu(int device) {
|
||||
return ggml_sycl_info().devices[device].l0_discrete_gpu;
|
||||
}
|
||||
@@ -608,12 +611,12 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
|
||||
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
|
||||
const void *ptr_src, size_t size) {
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
|
||||
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
|
||||
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
||||
const bool l0_copy_supported =
|
||||
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
|
||||
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
||||
if (g_ggml_sycl_use_level_zero_api && l0_copy_supported) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
||||
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
||||
@@ -973,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
||||
}
|
||||
|
||||
switch(type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
||||
@@ -3504,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
||||
|
||||
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
@@ -3519,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
|
||||
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
@@ -3529,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
||||
|
||||
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -3543,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
|
||||
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
@@ -4664,12 +4672,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_OP_ARGMAX:
|
||||
ggml_sycl_argmax(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_sycl_op_conv_transpose_1d(ctx, dst);
|
||||
case GGML_OP_CONV_2D:
|
||||
ggml_sycl_op_conv2d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
ggml_sycl_op_conv2d_dw(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_3D:
|
||||
ggml_sycl_conv_3d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_sycl_op_conv_transpose_1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
ggml_sycl_op_conv2d_transpose(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_REPEAT:
|
||||
ggml_sycl_repeat(ctx, dst);
|
||||
break;
|
||||
@@ -5373,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_sycl_device_context *sycl_ctx =
|
||||
(ggml_backend_sycl_device_context *)dev->context;
|
||||
int device = sycl_ctx->device;
|
||||
@@ -5387,6 +5404,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
}
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
return true;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_SGN:
|
||||
@@ -5434,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
struct ggml_tensor * a = op->src[0];
|
||||
struct ggml_tensor * b = op->src[1];
|
||||
|
||||
// disable Q1_0 until implementation
|
||||
if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
|
||||
|
||||
|
||||
// TODO: The configuration below needs more work to be supported with oneDNN
|
||||
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
|
||||
a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
|
||||
@@ -5456,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
// TODO: This specific configuration can fail with oneDNN and needs more debugging
|
||||
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
|
||||
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
|
||||
printf("zjy 2\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_OUT_PROD:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
||||
return op->type == GGML_TYPE_F32 &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 ||
|
||||
(op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] &&
|
||||
op->src[0]->ne[3] == op->src[1]->ne[3])) &&
|
||||
op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
@@ -5718,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
// Wrapper around do_ggml_backend_sycl_device_supports_op() that passes the verdict,
// together with the op name and result type, to GGML_SYCL_DEBUG before returning it.
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    const bool supported = do_ggml_backend_sycl_device_supports_op(dev, op);

    const char * verdict = supported ? "true" : "false";
    GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op),
                    ggml_type_name(op->type), verdict);

    return supported;
}
|
||||
|
||||
static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
|
||||
return false;
|
||||
|
||||
@@ -1194,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols(
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy,
|
||||
float * dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK1_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK1_0, QI1_0, block_q1_0,
|
||||
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <int ncols_dst>
|
||||
static void mul_mat_vec_q1_0_q8_1_sycl_ncols(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
const int ncols, const int nrows,
|
||||
const int stride_col_y, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK1_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_ncols<QK1_0, QI1_0, block_q1_0,
|
||||
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
const int ncols, const int nrows, const int ncols_dst,
|
||||
const int stride_col_y, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
switch (ncols_dst) {
|
||||
case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break;
|
||||
case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
@@ -2120,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q1_0:
|
||||
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
|
||||
const int stride_col_y = src1_padded_col_size / QK8_1;
|
||||
const int stride_col_dst = dst->ne[0];
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols);
|
||||
mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff,
|
||||
src1_ncols, stride_col_y, stride_col_dst, stream);
|
||||
return;
|
||||
} else if (i == 0 || src1_ncols == 1) {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n");
|
||||
mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
|
||||
const int stride_col_y = src1_padded_col_size / QK8_1;
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#include "outprod.hpp"
|
||||
#include "convert.hpp"
|
||||
|
||||
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
@@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
GGML_ASSERT(ne01 == ne11); // Inner dimensions must match
|
||||
GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows
|
||||
GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
GGML_ASSERT(ne2 % ne02 == 0);
|
||||
GGML_ASSERT(ne3 % ne03 == 0);
|
||||
|
||||
// Get data pointers
|
||||
const float* src0_d = (const float*)src0->data;
|
||||
const float* src1_d = (const float*)src1->data;
|
||||
float* dst_d = (float*)dst->data;
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
ggml_sycl_pool_alloc<float> src0_as_f32(ctx.pool());
|
||||
int64_t src0_nb02 = nb02;
|
||||
int64_t src0_nb03 = nb03;
|
||||
if (src0->type == GGML_TYPE_Q1_0) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting src0 Q1_0 to fp32");
|
||||
src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03);
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
|
||||
GGML_ASSERT(to_fp32_sycl != nullptr);
|
||||
to_fp32_sycl(src0->data, const_cast<float *>(src0_d), ne00 * ne01 * ne02 * ne03, stream);
|
||||
|
||||
// Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03].
|
||||
src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float);
|
||||
src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float);
|
||||
}
|
||||
|
||||
// GEMM parameters
|
||||
const float alpha = 1.0f;
|
||||
@@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
|
||||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
|
||||
const int64_t r2 = ne2 / ne02;
|
||||
const int64_t r3 = ne3 / ne03;
|
||||
|
||||
try {
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
|
||||
}
|
||||
catch (sycl::exception const& exc) {
|
||||
// OUT_PROD applies independently to each (i2, i3) destination plane.
|
||||
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||
const int64_t i03 = i3 / r3;
|
||||
const int64_t i02 = i2 / r2;
|
||||
|
||||
const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03);
|
||||
const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13);
|
||||
float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3);
|
||||
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01, alpha, src0_plane, ne00,
|
||||
src1_plane, ldb, beta, dst_plane, ne0);
|
||||
}
|
||||
}
|
||||
} catch (sycl::exception const& exc) {
|
||||
std::cerr << exc.what() << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
@@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
|
||||
vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
|
||||
}
|
||||
|
||||
#define VDR_Q1_0_Q8_1_MMVQ 1
|
||||
#define VDR_Q1_0_Q8_1_MMQ 4
|
||||
|
||||
static __dpct_inline__ float
|
||||
vec_dot_q1_0_q8_1(const void *__restrict__ vbq,
|
||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||
|
||||
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq;
|
||||
|
||||
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
|
||||
const float d1 = bq1_0->d;
|
||||
const int v = get_int_from_uint8_aligned(bq1_0->qs, iqs);
|
||||
|
||||
int vi_bytes[8];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int shift = j * 4;
|
||||
const int bits4 = (v >> shift) & 0x0F;
|
||||
const int b0 = (bits4 & 0x01) ? 1 : -1;
|
||||
const int b1 = (bits4 & 0x02) ? 1 : -1;
|
||||
const int b2 = (bits4 & 0x04) ? 1 : -1;
|
||||
const int b3 = (bits4 & 0x08) ? 1 : -1;
|
||||
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
|
||||
}
|
||||
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j);
|
||||
sumi = ggml_sycl_dp4a(vi_bytes[j], u, sumi);
|
||||
}
|
||||
|
||||
return d1 * bq8_1_chunk->ds[0] * sumi;
|
||||
}
|
||||
|
||||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
||||
|
||||
|
||||
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
|
||||
img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
|
||||
output.entries.push_back(std::move(res));
|
||||
}
|
||||
output.grid_x = inst.grid_size.width;
|
||||
output.grid_y = inst.grid_size.height;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
|
||||
output.entries.push_back(std::move(img_f32));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
// call super class preprocessor
|
||||
bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
if (output.entries.size() == 1) {
|
||||
// Single-tile (overview only): append one newline row.
|
||||
output.entries[0]->add_newline = true;
|
||||
} else {
|
||||
// Multi-tile: overview gets no newline, grid tiles get one.
|
||||
output.entries[0]->add_newline = false;
|
||||
for (size_t i = 1; i < output.entries.size(); ++i) {
|
||||
output.entries[i]->add_newline = true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// similar to llava_uhd, but has add_newline
|
||||
struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
+6
-25
@@ -639,7 +639,7 @@ struct mtmd_context {
|
||||
{
|
||||
img_beg = "<image>";
|
||||
img_end = "";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
|
||||
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
|
||||
GGML_ASSERT(!bitmaps.empty());
|
||||
|
||||
if (!bitmaps[0]->is_audio) {
|
||||
// note: only one type of media is supported per call, caller should enforce this
|
||||
const bool is_vision = !bitmaps[0]->is_audio;
|
||||
|
||||
if (is_vision) {
|
||||
// handle image
|
||||
|
||||
if (!ctx->ctx_v) {
|
||||
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
|
||||
batch_f32.grid_y = tmp_batch.grid_y;
|
||||
}
|
||||
|
||||
// Annotate llava-next style tiles so clip_n_output_tokens accounts
|
||||
// for per-tile newline injection.
|
||||
if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
|
||||
if (batch_f32.entries.size() == 1) {
|
||||
// Single-tile (overview only): append one newline row.
|
||||
batch_f32.entries[0]->add_newline = true;
|
||||
} else {
|
||||
// Multi-tile: overview gets no newline, grid tiles get one.
|
||||
batch_f32.entries[0]->add_newline = false;
|
||||
for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
|
||||
batch_f32.entries[i]->add_newline = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
|
||||
if (
|
||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|
||||
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
|
||||
) {
|
||||
if (has_tiling_grid) {
|
||||
// [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
|
||||
GGML_ASSERT(bitmaps.size() == 1);
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ echo "Running pre-commit checks for llama-ui..."
|
||||
# Format only staged files
|
||||
staged_ui=$(git diff --cached --name-only -- tools/ui/)
|
||||
if [ -n "$staged_ui" ]; then
|
||||
echo "$staged_ui" | xargs npx --no-install prettier --write
|
||||
echo "$staged_ui" | xargs npm run format
|
||||
format_ok=$?
|
||||
# Re-stage formatted files
|
||||
git add tools/ui/
|
||||
|
||||
@@ -57,6 +57,7 @@ if [ $lint_ok -ne 0 ]; then
|
||||
echo "❌ Lint failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $test_ok -ne 0 ]; then
|
||||
echo "❌ Tests failed"
|
||||
exit 1
|
||||
|
||||
Reference in New Issue
Block a user