mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-25 15:17:41 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e8ecce53b8 | |||
| 683b04cc4a | |||
| f728adab68 | |||
| 3e61ea0e2f | |||
| fdbd6abee2 | |||
| e12a0128ab | |||
| b3ce5cedf4 | |||
| e9fb3b3fc0 |
@@ -222,6 +222,16 @@ if (LLAMA_BUILD_APP)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
|
||||
# Standalone libmtmd build without pulling in the rest of the tools/ tree.
|
||||
# Useful when packaging just the mtmd library for language bindings (e.g. an
|
||||
# Apple XCFramework, or a WASM build). When the full tools build is enabled,
|
||||
# mtmd is already built by the tools/ subdirectory above; this hook only fires
|
||||
# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
|
||||
option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
|
||||
if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
|
||||
add_subdirectory(tools/mtmd)
|
||||
endif()
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
set(TARGET llama-app)
|
||||
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
add_executable(${TARGET} llama.cpp download.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <filesystem>
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv) {
|
||||
printf(
|
||||
"\nexamples:\n"
|
||||
" %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
|
||||
" %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
|
||||
" %s -hf ggml-org/models -hff model.gguf\n"
|
||||
" %s -mu https://example.com/model.gguf -m model.gguf\n"
|
||||
"\n",
|
||||
argv[0], argv[0], argv[0], argv[0]
|
||||
);
|
||||
}
|
||||
|
||||
int llama_download(int argc, char ** argv);
|
||||
|
||||
int llama_download(int argc, char ** argv) {
|
||||
common_init();
|
||||
|
||||
common_params params;
|
||||
params.verbosity = LOG_LEVEL_ERROR;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
|
||||
!params.model.path.empty() || !params.model.docker_repo.empty();
|
||||
if (!has_source) {
|
||||
fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
try {
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_DOWNLOAD, {});
|
||||
} catch (const std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!params.models_preset.empty()) {
|
||||
// -hf pointed at a preset repo: print the preset path and stop
|
||||
printf("%s\n", params.models_preset.c_str());
|
||||
return 0;
|
||||
}
|
||||
if (params.model.path.empty()) {
|
||||
fprintf(stderr, "error: model download failed\n");
|
||||
return 1;
|
||||
}
|
||||
if (!std::filesystem::exists(params.model.path)) {
|
||||
fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("%s\n", params.model.path.c_str());
|
||||
if (!params.mmproj.path.empty()) {
|
||||
printf("%s\n", params.mmproj.path.c_str());
|
||||
}
|
||||
if (!params.speculative.draft.mparams.path.empty()) {
|
||||
printf("%s\n", params.speculative.draft.mparams.path.c_str());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -19,6 +19,7 @@ int llama_batched_bench(int argc, char ** argv);
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
int llama_download(int argc, char ** argv);
|
||||
|
||||
// Self-update is only supported for binaries built with llama-install.sh
|
||||
static int llama_update(int argc, char ** argv) {
|
||||
@@ -61,6 +62,7 @@ static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"update", "Update llama to the latest release", {}, UPDATE_HIDDEN, llama_update },
|
||||
{"download", "Download a model", {"get"}, false, llama_download },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
|
||||
@@ -13,6 +13,7 @@ LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
LLAMA_BUILD_SERVER=OFF
|
||||
LLAMA_BUILD_MTMD=ON
|
||||
GGML_METAL=ON
|
||||
GGML_METAL_EMBED_LIBRARY=ON
|
||||
GGML_BLAS_DEFAULT=ON
|
||||
@@ -39,6 +40,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
|
||||
-DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
|
||||
-DGGML_METAL=${GGML_METAL}
|
||||
@@ -126,6 +128,8 @@ setup_framework_structure() {
|
||||
cp ggml/include/ggml-cpu.h ${header_path}
|
||||
cp ggml/include/ggml-blas.h ${header_path}
|
||||
cp ggml/include/gguf.h ${header_path}
|
||||
cp tools/mtmd/mtmd.h ${header_path}
|
||||
cp tools/mtmd/mtmd-helper.h ${header_path}
|
||||
|
||||
# Create module map (common for all platforms)
|
||||
cat > ${module_path}module.modulemap << EOF
|
||||
@@ -247,6 +251,7 @@ combine_static_libraries() {
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
"${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
|
||||
)
|
||||
|
||||
# Create temporary directory for processing
|
||||
|
||||
+33
-18
@@ -594,6 +594,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
const bool skip_model_download =
|
||||
// server will call common_params_handle_models() later, so we skip it here
|
||||
ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
|
||||
// download calls common_params_handle_models() itself and prints the paths
|
||||
ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
|
||||
// export_graph_ops loads only metadata
|
||||
ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
|
||||
|
||||
@@ -671,15 +673,19 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
|
||||
common_options.push_back(&opt);
|
||||
}
|
||||
}
|
||||
printf("----- common params -----\n\n");
|
||||
print_options(common_options);
|
||||
printf("\n\n----- sampling params -----\n\n");
|
||||
print_options(sampling_options);
|
||||
printf("\n\n----- speculative params -----\n\n");
|
||||
print_options(spec_options);
|
||||
// TODO: maybe convert enum llama_example to string
|
||||
printf("\n\n----- example-specific params -----\n\n");
|
||||
print_options(specific_options);
|
||||
bool first = true;
|
||||
auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
|
||||
if (options.empty()) {
|
||||
return;
|
||||
}
|
||||
printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
|
||||
first = false;
|
||||
print_options(options);
|
||||
};
|
||||
print_section("common params", common_options);
|
||||
print_section("sampling params", sampling_options);
|
||||
print_section("speculative params", spec_options);
|
||||
print_section("example-specific params", specific_options);
|
||||
}
|
||||
|
||||
static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
@@ -1079,7 +1085,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
||||
*/
|
||||
auto add_opt = [&](common_arg arg) {
|
||||
if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
|
||||
// download only exposes the handful of args explicitly tagged for it
|
||||
const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
|
||||
if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
|
||||
ctx_arg.options.push_back(std::move(arg));
|
||||
}
|
||||
};
|
||||
@@ -1090,7 +1098,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.usage = true;
|
||||
}
|
||||
));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
|
||||
add_opt(common_arg(
|
||||
{"--version"},
|
||||
"show version and build info",
|
||||
@@ -2212,7 +2220,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, bool value) {
|
||||
params.no_mmproj = !value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
|
||||
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-offload"},
|
||||
{"--no-mmproj-offload"},
|
||||
@@ -2611,14 +2619,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
|
||||
add_opt(common_arg(
|
||||
{"-mu", "--model-url"}, "MODEL_URL",
|
||||
"model download url (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.url = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
add_opt(common_arg(
|
||||
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
|
||||
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
|
||||
@@ -2627,7 +2635,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.docker_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_DOCKER_REPO"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
||||
@@ -2637,14 +2645,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_REPO"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hff", "--hf-file"}, "FILE",
|
||||
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.hf_file = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_FILE"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository for the vocoder model (default: unused)",
|
||||
@@ -2665,7 +2673,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.hf_token = value;
|
||||
}
|
||||
).set_env("HF_TOKEN"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
|
||||
add_opt(common_arg(
|
||||
{"--mtp"},
|
||||
"also download the multi-token prediction (MTP) head, if available (default: unused)",
|
||||
[](common_params & params) {
|
||||
params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
|
||||
add_opt(common_arg(
|
||||
{"--context-file"}, "FNAME",
|
||||
"file to load context from (use comma-separated values to specify multiple files)",
|
||||
|
||||
@@ -96,6 +96,7 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
LLAMA_EXAMPLE_RESULTS,
|
||||
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
|
||||
LLAMA_EXAMPLE_DOWNLOAD,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
@@ -136,6 +136,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"LlamaModel": "llama",
|
||||
"Eagle3DraftModel": "llama",
|
||||
"Eagle3Speculator": "llama",
|
||||
"Eagle3LlamaForCausalLM": "llama",
|
||||
"LlamaForCausalLMEagle3": "llama",
|
||||
"LlavaForConditionalGeneration": "llama",
|
||||
"LlavaStableLMEpochForCausalLM": "stablelm",
|
||||
|
||||
@@ -23,6 +23,7 @@ from .base import ModelBase, TextModel, gguf, logger
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3LlamaForCausalLM",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"IQuestCoderForCausalLM",
|
||||
|
||||
@@ -413,6 +413,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
|
||||
|------------------|----------------------------------------|
|
||||
| Single device | --split-mode none --main-gpu DEVICE_ID |
|
||||
| Multiple devices | --split-mode layer (default) |
|
||||
| Multiple devices | --split-mode tensor (tensor parallelism) |
|
||||
|
||||
`--split-mode tensor` (tensor parallelism) shards each layer across the selected
|
||||
GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
|
||||
left at its default `auto`, so `--split-mode tensor` works out of the box.
|
||||
Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
|
||||
context creation. The default `f16` KV cache is recommended. Tensor parallelism
|
||||
is currently optimized for 2 GPUs; other device counts fall back to a generic
|
||||
all-reduce.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -715,6 +724,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
|
||||
|------------------|----------------------------------------|
|
||||
| Single device | --split-mode none --main-gpu DEVICE_ID |
|
||||
| Multiple devices | --split-mode layer (default) |
|
||||
| Multiple devices | --split-mode tensor (tensor parallelism) |
|
||||
|
||||
`--split-mode tensor` (tensor parallelism) shards each layer across the selected
|
||||
GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
|
||||
left at its default `auto`, so `--split-mode tensor` works out of the box.
|
||||
Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
|
||||
context creation. The default `f16` KV cache is recommended. Tensor parallelism
|
||||
is currently optimized for 2 GPUs; other device counts fall back to a generic
|
||||
all-reduce.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
+41
-1
@@ -13,6 +13,45 @@ The `llama-server` application supports several implementations of speculative d
|
||||
A much smaller model (called the _draft model_) generates drafts.
|
||||
A draft model is the most used approach in speculative decoding.
|
||||
|
||||
### EAGLE-3 (`draft-eagle3`)
|
||||
|
||||
EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
|
||||
reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer
|
||||
trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft
|
||||
vocabulary with its own `lm_head`, which is mapped back using a `d2t` table.
|
||||
|
||||
Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
|
||||
indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
|
||||
checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
|
||||
for `Qwen/Qwen3-4B`):
|
||||
|
||||
```bash
|
||||
python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
|
||||
--target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf
|
||||
|
||||
llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
|
||||
```
|
||||
|
||||
Supported EAGLE-3 draft models include:
|
||||
|
||||
- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B)
|
||||
- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B)
|
||||
- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3)
|
||||
- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
|
||||
- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
|
||||
- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
|
||||
- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3)
|
||||
- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
|
||||
- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
|
||||
- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
|
||||
- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
|
||||
- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
|
||||
- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3)
|
||||
- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16)
|
||||
- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context)
|
||||
|
||||
For the full and up-to-date list of supported models, see #18039.
|
||||
|
||||
### n-gram Cache (`ngram-cache`)
|
||||
|
||||
An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
|
||||
@@ -108,7 +147,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
### General Speculative Parameters
|
||||
|
||||
```
|
||||
--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
comma-separated list of types of speculative decoding to use
|
||||
(default: none)
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
@@ -247,6 +286,7 @@ Specifies a comma-separated list of speculative decoding types to use.
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft-simple` | Use a simple draft model for speculation |
|
||||
| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
|
||||
| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
|
||||
@@ -27,6 +27,14 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor
|
||||
// trio queried by the meta-backend via ggml_backend_reg_get_proc_address.
|
||||
// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's
|
||||
// pattern (ggml_backend_cuda_comm_*).
|
||||
GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends);
|
||||
GGML_BACKEND_API void ggml_backend_sycl_comm_free(void * comm_ctx);
|
||||
GGML_BACKEND_API bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
|
||||
|
||||
@@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
|
||||
static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const src1_t * src1,
|
||||
dst_t * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const uint32_t ne0,
|
||||
const uint32_t ne1,
|
||||
const uint32_t ne2,
|
||||
const uint3 ne3,
|
||||
const uint3 ne10,
|
||||
const uint3 ne11,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
/*const uint32_t s0,*/
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
@@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const uint32_t s0 = blockDim.x * gridDim.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
|
||||
for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
|
||||
// protect i0 from overflow
|
||||
if (ne0 - i0 <= s0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const uint32_t i3 = fastdiv(i, prod_012);
|
||||
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
|
||||
@@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = fastmodulo(i1, ne11);
|
||||
const int i12 = fastmodulo(i2, ne12);
|
||||
const int i13 = fastmodulo(i3, ne13);
|
||||
const uint32_t i11 = fastmodulo(i1, ne11);
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = fastmodulo(i0, ne10);
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
@@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
//GGML_ASSERT(s0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
@@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0 / 2LL, 1LL);
|
||||
@@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
|
||||
|
||||
if (block_nums.z > 65535 || block_nums.y > 65535) {
|
||||
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
|
||||
GGML_ASSERT(block_num <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
|
||||
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
|
||||
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
|
||||
@@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
|
||||
{
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
|
||||
@@ -5859,6 +5859,250 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Tensor parallelism (--split-mode tensor) for the SYCL backend.
|
||||
//
|
||||
// The meta-backend invokes these three entry points via get_proc_address:
|
||||
// * ggml_backend_sycl_comm_init - one-time per-graph setup
|
||||
// * ggml_backend_sycl_comm_allreduce_tensor - per-allreduce step
|
||||
// * ggml_backend_sycl_comm_free - tear-down
|
||||
//
|
||||
// For N=2 (dual-GPU), this is a degenerate ring allreduce with dual paths
|
||||
// chosen by tensor size:
|
||||
//
|
||||
// * Small (nelem < 32K): FP32 direct memcpy + per-device ADD
|
||||
// kernel. The kernel depends_on() its corresponding memcpy event
|
||||
// so it doesn't read partial data. Both devices run in parallel.
|
||||
//
|
||||
// * Large (nelem >= 32K): BF16-compressed. Each device compresses
|
||||
// its FP32 partial to BF16 locally, cross-device memcpys
|
||||
// to the peer (half the PCI bandwidth), where it is decompressed
|
||||
// and added into the local FP32 partial. 6 SYCL submissions per
|
||||
// allreduce (2 compress + 2 memcpy + 2 decompress-add) vs the
|
||||
// 4 for the small path, but the bandwidth saving > 6 GB/s PCIe x 2
|
||||
// dominates for larger tensors.
|
||||
//
|
||||
// Storage: A persistent uint8_t buffer per device, sized to
|
||||
// 4 * nelem bytes. Both paths reinterpret the same bytes (small path
|
||||
// as nelem floats; large path as outbox + inbox = 2*nelem uint16_t
|
||||
// each, using the full 4*nelem byte budget either way). Single
|
||||
// alloc+free per device keeps the SYCL pool's strict-LIFO invariant
|
||||
// trivial.
|
||||
//
|
||||
// For non-(N=2 FP32 contiguous) cases, comm_init or comm_allreduce_tensor
|
||||
// returns null/false, causing the meta-backend to use its generic
|
||||
// butterfly all-reduce fallback.
|
||||
// ==========================================================================
|
||||
|
||||
struct ggml_backend_sycl_comm_context {
|
||||
std::vector<ggml_backend_t> backends;
|
||||
// ONE persistent per-device byte buffer, 4*nelem bytes. Both the
|
||||
// FP32 small-tensor path and the BF16 large-tensor path share it
|
||||
// by reinterpreting.
|
||||
std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf0;
|
||||
std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf1;
|
||||
int64_t buf_nelem = 0;
|
||||
};
|
||||
|
||||
void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends) try {
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if (!ggml_backend_is_sycl(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Initial version: N=2 only. For N!=2, returning null makes the
|
||||
// meta-backend skip this backend-specific allreduce entirely.
|
||||
if (n_backends != 2) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * ctx = new ggml_backend_sycl_comm_context;
|
||||
ctx->backends.assign(backends, backends + n_backends);
|
||||
auto * sctx0 = (ggml_backend_sycl_context *) backends[0]->context;
|
||||
auto * sctx1 = (ggml_backend_sycl_context *) backends[1]->context;
|
||||
ctx->buf0 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx0->pool());
|
||||
ctx->buf1 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx1->pool());
|
||||
return ctx;
|
||||
}
|
||||
catch (const sycl::exception &) { return nullptr; }
|
||||
catch (...) { return nullptr; }
|
||||
|
||||
void ggml_backend_sycl_comm_free(void * comm_ctx_v) {
|
||||
auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
|
||||
if (comm_ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Sync both per-device queues so the pool_alloc destructors don't
|
||||
// return memory still in use by the last kernel.
|
||||
if (comm_ctx->backends.size() == 2) {
|
||||
auto * sctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
|
||||
auto * sctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
|
||||
try {
|
||||
sctx0->stream()->wait();
|
||||
sctx1->stream()->wait();
|
||||
} catch (...) { /* best effort during shutdown */ }
|
||||
}
|
||||
|
||||
delete comm_ctx;
|
||||
}
|
||||
|
||||
bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) try {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
// Fast path: N=2, F32/F16, contiguous, matching shapes.
|
||||
if (n_backends != 2) {
|
||||
return false;
|
||||
}
|
||||
// Accept F32 or F16 inputs natively (types must match). F16 takes the
|
||||
// direct 2-byte memcpy + add path below; other types return false so the
|
||||
// meta-backend uses its generic all-reduce.
|
||||
if (tensors[0]->type != tensors[1]->type) {
|
||||
return false;
|
||||
}
|
||||
if (tensors[0]->type != GGML_TYPE_F32 && tensors[0]->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguous(tensors[0]) || !ggml_is_contiguous(tensors[1])) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(tensors[0]) != ggml_nelements(tensors[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t nelem = ggml_nelements(tensors[0]);
|
||||
const size_t nbytes = ggml_nbytes(tensors[0]);
|
||||
if (nelem == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto * ctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
|
||||
auto * ctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
|
||||
queue_ptr q0 = ctx0->stream();
|
||||
queue_ptr q1 = ctx1->stream();
|
||||
|
||||
// Grow per-device byte buffers if needed (4 * nelem bytes each).
|
||||
if (comm_ctx->buf_nelem < nelem) {
|
||||
comm_ctx->buf0->realloc(nelem * 4);
|
||||
comm_ctx->buf1->realloc(nelem * 4);
|
||||
comm_ctx->buf_nelem = nelem;
|
||||
}
|
||||
uint8_t * buf0 = comm_ctx->buf0->get();
|
||||
uint8_t * buf1 = comm_ctx->buf1->get();
|
||||
|
||||
// F16 native path: direct 2-byte cross-device copy + add, skipping the
|
||||
// F32 round-trip the meta-backend fallback would force. Cross-device copies
|
||||
// go through dev2dev_memcpy because the two devices are in separate SYCL
|
||||
// contexts (a raw peer-USM q->memcpy would be a silent no-op).
|
||||
if (tensors[0]->type == GGML_TYPE_F16) {
|
||||
sycl::half * f16_out0 = (sycl::half *) tensors[0]->data;
|
||||
sycl::half * f16_out1 = (sycl::half *) tensors[1]->data;
|
||||
sycl::half * f16_tmp0 = (sycl::half *) buf0;
|
||||
sycl::half * f16_tmp1 = (sycl::half *) buf1;
|
||||
|
||||
q0->wait();
|
||||
q1->wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, f16_tmp0, tensors[1]->data, nbytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, f16_tmp1, tensors[0]->data, nbytes);
|
||||
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
f16_out0[i] = (sycl::half) ((float) f16_out0[i] + (float) f16_tmp0[i]);
|
||||
});
|
||||
});
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
f16_out1[i] = (sycl::half) ((float) f16_out1[i] + (float) f16_tmp1[i]);
|
||||
});
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
float * out0 = (float *) tensors[0]->data;
|
||||
float * out1 = (float *) tensors[1]->data;
|
||||
|
||||
// BF16 threshold: above this, the PCIe savings from halving the
|
||||
// cross-device bytes outweigh the 2 extra compress kernels.
|
||||
// Below: stay on the FP32 fast path. Threshold mirrors the CUDA
|
||||
// NCCL allreduce pattern for n_backends=2.
|
||||
static constexpr int64_t BF16_THRESHOLD = 32768;
|
||||
|
||||
if (nelem < BF16_THRESHOLD) {
|
||||
// FP32 small path: 4 SYCL submissions per allreduce.
|
||||
float * tmp0 = (float *) buf0;
|
||||
float * tmp1 = (float *) buf1;
|
||||
|
||||
// COMM-D2D-FIX: the two devices are in SEPARATE SYCL contexts, so a raw
|
||||
// q->memcpy of a peer USM pointer is a silent no-op. Route cross-device
|
||||
// copies through dev2dev_memcpy (L0 direct copy / host staging). It is
|
||||
// synchronous, so wait for the local partials to be produced first.
|
||||
q0->wait();
|
||||
q1->wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, tmp0, tensors[1]->data, nbytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, tmp1, tensors[0]->data, nbytes);
|
||||
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out0[i] += tmp0[i];
|
||||
});
|
||||
});
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out1[i] += tmp1[i];
|
||||
});
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
// BF16 large path: 6 SYCL submissions per allreduce, but the
|
||||
// cross-device memcpy is HALF the bytes. Pure bit-shift
|
||||
// conversion (no rounding) — matches ggml's truncating fp32->bf16.
|
||||
uint16_t * outbox0 = (uint16_t *) buf0;
|
||||
uint16_t * inbox0 = outbox0 + nelem;
|
||||
uint16_t * outbox1 = (uint16_t *) buf1;
|
||||
uint16_t * inbox1 = outbox1 + nelem;
|
||||
|
||||
// Phase A: compress each device's local partial in parallel.
|
||||
sycl::event c0 = q0->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
outbox0[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out0[i]) >> 16);
|
||||
});
|
||||
|
||||
sycl::event c1 = q1->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
outbox1[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out1[i]) >> 16);
|
||||
});
|
||||
|
||||
// Phase B: COMM-D2D-FIX-BF16 cross-device copy of compressed bytes via
|
||||
// dev2dev_memcpy (separate SYCL contexts; sync copy after compress).
|
||||
const size_t bf16_bytes = nelem * sizeof(uint16_t);
|
||||
c0.wait();
|
||||
c1.wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, inbox0, outbox1, bf16_bytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, inbox1, outbox0, bf16_bytes);
|
||||
|
||||
// Phase C: decompress + add into local FP32 partial.
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out0[i] += sycl::bit_cast<float>(((uint32_t) inbox0[i]) << 16);
|
||||
});
|
||||
});
|
||||
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out1[i] += sycl::bit_cast<float>(((uint32_t) inbox1[i]) << 16);
|
||||
});
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
catch (const sycl::exception &) { return false; }
|
||||
catch (...) { return false; }
|
||||
|
||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
|
||||
GGML_UNUSED(reg);
|
||||
|
||||
@@ -5866,6 +6110,17 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
|
||||
return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
}
|
||||
|
||||
// Tensor parallelism (--split-mode tensor) entry points.
|
||||
if (strcmp(name, "ggml_backend_comm_init") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_free") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_allreduce_tensor;
|
||||
}
|
||||
|
||||
// SYCL doesn't support registering host memory, left here for reference
|
||||
// "ggml_backend_register_host_buffer"
|
||||
// "ggml_backend_unregister_host_buffer"
|
||||
|
||||
+1
-1
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer_all;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -146,6 +146,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
|
||||
|
||||
llama_synchronize(ctx.get());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
+23
-17
@@ -115,22 +115,28 @@ if (TARGET mtmd)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
# Gate CLI binaries on LLAMA_BUILD_TOOLS so that standalone library-only
|
||||
# builds (LLAMA_BUILD_MTMD=ON with LLAMA_BUILD_TOOLS=OFF — e.g. Apple
|
||||
# XCFramework packaging) skip the executables entirely. LLAMA_BUILD_COMMON
|
||||
# defaults to ON in standalone builds, so we cannot rely on it for gating.
|
||||
if (LLAMA_BUILD_TOOLS)
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable (${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable (${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# mtmd-debug tool
|
||||
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
|
||||
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
|
||||
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|
||||
endif()
|
||||
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# mtmd-debug tool
|
||||
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
|
||||
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
|
||||
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|
||||
|
||||
+10
-1
@@ -14,6 +14,7 @@
|
||||
import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
|
||||
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
|
||||
import { chatStore } from '$lib/stores/chat.svelte';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { TooltipSide } from '$lib/enums';
|
||||
@@ -34,6 +35,14 @@
|
||||
|
||||
const isStripExpanded = $derived(isExpandedMode || hoveredTooltip !== null);
|
||||
const isOnMobile = $derived(isMobile.current);
|
||||
const alwaysShowOnDesktop = $derived(config().alwaysShowSidebarOnDesktop as boolean);
|
||||
|
||||
// Keep the sidebar expanded on desktop when the user pins it open
|
||||
$effect(() => {
|
||||
if (alwaysShowOnDesktop && !isOnMobile) {
|
||||
isExpandedMode = true;
|
||||
}
|
||||
});
|
||||
|
||||
function toggleExpandedMode() {
|
||||
isExpandedMode = !isExpandedMode;
|
||||
@@ -183,7 +192,7 @@
|
||||
/>
|
||||
</div>
|
||||
|
||||
{#if isExpandedMode || isOnMobile}
|
||||
{#if isOnMobile || (isExpandedMode && !alwaysShowOnDesktop)}
|
||||
<div
|
||||
class="flex items-center transition-all duration-150 ease-out {isMobile.current &&
|
||||
!isExpandedMode
|
||||
|
||||
@@ -33,8 +33,6 @@
|
||||
import { SETTINGS_KEYS } from '$lib/constants';
|
||||
|
||||
let { children } = $props();
|
||||
let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
|
||||
let isDesktop = $derived(!isMobile.current);
|
||||
let innerHeight = $state<number | undefined>();
|
||||
let innerWidth = $state(browser ? window.innerWidth : 0);
|
||||
|
||||
@@ -164,12 +162,6 @@
|
||||
updateFavicon();
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
if (alwaysShowSidebarOnDesktop && isDesktop) {
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
// Initialize server properties on app load (run once)
|
||||
$effect(() => {
|
||||
// Only fetch if we don't already have props
|
||||
|
||||
Reference in New Issue
Block a user