Compare commits

...

3 Commits

Author SHA1 Message Date
Dev-iL b4024af6c2 llama : skip main_gpu validation when no devices are available (#23405) 2026-06-17 17:30:26 +03:00
Ruixiang Wang 1a2dea29b9 spec: fix segfault error on long prompts for eagle3 (#24707) 2026-06-17 17:29:49 +03:00
Neo Zhang 74a80dd9c0 [SYCL] add dev2dev memcpy by SYCL API (#24476)
* add dev2dev memcpy by SYCL API

* mv GGML_SYCL_DEV2DEV_MEMCPY to runntime table

* update the detect method for p2p comm

* fix the erro created during fix confilct

---------

Co-authored-by: Neo Zhang <NA>
2026-06-17 17:21:34 +03:00
9 changed files with 78 additions and 27 deletions
+2
View File
@@ -712,6 +712,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DEV2DEV_MEMCPY | 0 (default) or 1 | Choose the SYCL or L0 API in dev2dev memory copy.<br>Value: <br>* 0: SYCL API (default)<br>* 1: L0 API -- L0 API is found to lead to abnormal crash in some case. This debug flag is used to check the issue.|
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
@@ -731,6 +732,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |
## Design Rule
- Open to all contributors.
+6
View File
@@ -62,6 +62,7 @@ extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;
extern int g_ggml_sycl_enable_flash_attention;
extern int g_ggml_sycl_dev2dev_memcpy;
#if defined(__clang__) && __has_builtin(__builtin_expect)
@@ -126,6 +127,11 @@ enum ggml_sycl_backend_gpu_mode {
SYCL_MUL_GPU_MODE
};
enum ggml_sycl_dev2dev_memcpy_mode {
DEV2DEV_MEMCPY_SYCL = 0,
DEV2DEV_MEMCPY_L0 = 1,
};
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
static void crash() {
+15 -7
View File
@@ -13,14 +13,14 @@
#ifndef GGML_SYCL_DPCT_HELPER_HPP
#define GGML_SYCL_DPCT_HELPER_HPP
#include <cstdlib>
#include <iostream>
#include <map>
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <oneapi/mkl.hpp>
#include <map>
#include "ggml.h"
#if defined(__linux__)
#include <sys/mman.h>
#elif defined(_WIN64)
@@ -43,6 +43,7 @@
#include <windows.h>
#endif
#define DPCT_COMPATIBILITY_TEMP (900)
#if defined(_MSC_VER)
@@ -59,6 +60,13 @@
#define __dpct_noinline__ __attribute__((noinline))
#endif
#define DPCT_UNUSED(x) (void)(x)
inline void _abort(const char * str) {
std::cerr << str << std::endl;
std::abort();
}
inline std::string get_device_type_name(const sycl::device &Device) {
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
switch (DeviceType) {
@@ -1017,7 +1025,7 @@ namespace dpct
if (backend == "opencl:cpu") return 4;
if (backend == "opencl:acc") return 5;
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
GGML_ABORT("fatal error");
_abort("fatal error");
}
static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2);
@@ -1426,7 +1434,7 @@ namespace dpct
if (!size)
return sycl::event{};
return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
DPCT_UNUSED(direction);
}
// Get actual copy range and make sure it will not exceed range.
@@ -2092,7 +2100,7 @@ namespace dpct
if (!size)
return sycl::event{};
return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
DPCT_UNUSED(direction);
}
// Get actual copy range and make sure it will not exceed range.
+38 -14
View File
@@ -86,6 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_use_async_mem_op_requested = 1;
int g_ggml_sycl_enable_level_zero = 0;
int g_ggml_sycl_enable_flash_attention = 1;
int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
int g_ggml_sycl_usm_system = 0;
static ggml_sycl_device_info ggml_sycl_init() {
@@ -272,6 +273,11 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1);
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
if (g_ggml_sycl_enable_level_zero == 0) {
g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
}
#ifdef SYCL_FLASH_ATTN
g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
#else
@@ -324,8 +330,11 @@ static void ggml_check_sycl() try {
#endif
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
#else
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
g_ggml_sycl_dev2dev_memcpy);
#endif
#if GGML_SYCL_DNNL
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
@@ -598,27 +607,42 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
const bool l0_copy_supported = g_ggml_sycl_enable_level_zero &&
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
if (l0_copy_supported) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
ze_command_list_handle_t cl;
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
if (r == ZE_RESULT_SUCCESS) {
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
zeCommandListDestroy(cl);
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
const bool l0_copy_supported =
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
ze_command_list_handle_t cl;
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
if (r == ZE_RESULT_SUCCESS) {
return;
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by L0\n");
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
zeCommandListDestroy(cl);
if (r == ZE_RESULT_SUCCESS) {
return;
}
}
}
}
#endif
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_SYCL) {
if (q_dst.get_device().ext_oneapi_can_access_peer(q_src.get_device(),
sycl::ext::oneapi::peer_access::access_supported)) {
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by SYCL\n");
SYCL_CHECK(CHECK_TRY_ERROR(q_dst.memcpy(ptr_dst, ptr_src, size).wait()));
return;
}
}
// Host-staged copy
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by host forward\n");
char *host_buf = (char *)malloc(size);
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
+1 -1
View File
@@ -1382,7 +1382,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
// eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
const int64_t n_embd = hparams.n_embd_inp();
const int64_t n_embd = hparams.n_embd_inp_enc();
const int64_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
+4
View File
@@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const {
return n_embd_inp;
}
uint32_t llama_hparams::n_embd_inp_enc() const {
return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp();
}
uint32_t llama_hparams::n_embd_out() const {
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
+7
View File
@@ -189,6 +189,10 @@ struct llama_hparams {
// input embedding dimension (0 = use n_embd)
uint32_t n_embd_inp_impl = 0;
// encoder input embedding dimension (0 = use n_embd_inp())
// e.g. the eagle3 encoder fuses target_layers * target_hidden features
uint32_t n_embd_inp_enc_impl = 0;
// output embedding dimension (0 = use n_embd)
uint32_t n_embd_out_impl = 0;
@@ -305,6 +309,9 @@ struct llama_hparams {
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of the encoder input embeddings
uint32_t n_embd_inp_enc() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
+1 -1
View File
@@ -249,7 +249,7 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama
}
// if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
if (params.split_mode == LLAMA_SPLIT_MODE_NONE && !model->devices.empty()) {
if (params.main_gpu < 0) {
model->devices.clear();
} else {
+4 -4
View File
@@ -19,7 +19,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
// eagle3 norm_before_residual (optional, default false)
// compatible with Readhat eagle3 speculator model
@@ -34,7 +34,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
LLAMA_LOAD_LOCALS;
const int64_t n_embd_inp = hparams.n_embd_inp();
const int64_t n_embd_inp = hparams.n_embd_inp_enc();
const int64_t n_embd_attn_input = 2 * n_embd;
// Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
@@ -109,8 +109,8 @@ ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
// Input: Target model features (3 layers concatenated: low, mid, high)
// Data will be provided via ubatch->embd in encode_eagle3_features()
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp_enc());
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens);
ggml_set_input(inp_target->embd);
cur = inp_target->embd;