Compare commits

...

1 Commits

Author SHA1 Message Date
Neo Zhang 9724f664e8 [SYCL] rename GGML_SYCL_SUPPORT_LEVEL_ZERO (#24719)
* rename GGML_SYCL_SUPPORT_LEVEL_ZERO to GGML_SYCL_SUPPORT_LEVEL_ZERO_API, and GGML_SYCL_ENABLE_LEVEL_ZERO to  GGML_SYCL_USE_LEVEL_ZERO_API

* fix code format

* fix error when rebase
2026-06-18 11:18:26 +03:00
6 changed files with 32 additions and 32 deletions
+2 -2
View File
@@ -759,7 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -774,7 +774,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
+1 -1
View File
@@ -249,7 +249,7 @@ option(GGML_SYCL "ggml: use SYCL"
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
+5 -5
View File
@@ -39,8 +39,8 @@ if (WIN32)
set(CMAKE_CXX_COMPILER "icx")
set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
endif()
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO_API is enabled)
if(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH})
if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
@@ -105,8 +105,8 @@ endif()
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO_API ${GGML_SYCL_SUPPORT_LEVEL_ZERO_API}")
if (GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
# Link against Level Zero loader for direct device memory allocation.
# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
# in the xe kernel driver during multi-GPU inference.
@@ -114,7 +114,7 @@ if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
else()
+5 -5
View File
@@ -12,7 +12,7 @@
#include "common.hpp"
#include <sycl/backend.hpp>
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
#include <level_zero/ze_api.h>
#endif
@@ -84,9 +84,9 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
return sycl_down_blk_size;
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
return g_ggml_sycl_enable_level_zero &&
return g_ggml_sycl_use_level_zero_api &&
q.get_device().is_gpu() &&
q.get_backend() == sycl::backend::ext_oneapi_level_zero;
}
@@ -95,7 +95,7 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (ggml_sycl_use_level_zero_device_alloc(q)) {
void *ptr = nullptr;
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
@@ -127,7 +127,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
if (!ptr) return;
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (ggml_sycl_use_level_zero_device_alloc(q)) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
zeMemFree(ze_ctx, ptr);
+1 -1
View File
@@ -324,7 +324,7 @@ struct ggml_tensor_extra_gpu {
optimize_feature optimized_feature;
};
extern int g_ggml_sycl_enable_level_zero;
extern int g_ggml_sycl_use_level_zero_api;
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
void ggml_sycl_free_device(void *ptr, sycl::queue &q);
+18 -18
View File
@@ -32,7 +32,7 @@
#include <sycl/sycl.hpp>
#include <sycl/backend.hpp>
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
#include <level_zero/ze_api.h>
#endif
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
@@ -87,7 +87,7 @@ int g_ggml_sycl_enable_vmm = 1;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_use_async_mem_op_requested = 1;
int g_ggml_sycl_enable_level_zero = 0;
int g_ggml_sycl_use_level_zero_api = 0;
int g_ggml_sycl_enable_flash_attention = 1;
int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
int g_ggml_sycl_usm_system = 0;
@@ -157,7 +157,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.ext_oneapi_level_zero = false;
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
ze_device_properties_t props = {};
@@ -172,13 +172,13 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.default_tensor_split[id] /= total_vram;
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
// Large buffers can be allocated before ggml_check_sycl() initializes other
// g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
g_ggml_sycl_enable_level_zero =
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
g_ggml_sycl_use_level_zero_api =
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_USE_LEVEL_ZERO_API", 1);
#else
g_ggml_sycl_enable_level_zero = 0;
g_ggml_sycl_use_level_zero_api = 0;
#endif
return info;
@@ -277,7 +277,7 @@ static void ggml_check_sycl() try {
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
if (g_ggml_sycl_enable_level_zero == 0) {
if (g_ggml_sycl_use_level_zero_api == 0) {
g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
}
@@ -312,10 +312,10 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
#endif
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO_API)
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: yes\n");
#else
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: no\n");
#endif
#if defined(GGML_SYCL_USE_VMM)
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
@@ -331,12 +331,12 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
#endif
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: %d\n", g_ggml_sycl_use_level_zero_api);
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
#else
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: Disable Level Zero API usage by compile flag\n");
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO_API\n",
g_ggml_sycl_dev2dev_memcpy);
#endif
#if GGML_SYCL_DNNL
@@ -602,7 +602,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
static bool ggml_sycl_is_l0_discrete_gpu(int device) {
return ggml_sycl_info().devices[device].l0_discrete_gpu;
}
@@ -611,12 +611,12 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
const bool l0_copy_supported =
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
if (g_ggml_sycl_use_level_zero_api && l0_copy_supported) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,