mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 32120c10e3 | |||
| d5fb104293 | |||
| 635b65ad7a | |||
| e3a74b2990 | |||
| ac79caa7ce | |||
| fdd109883d | |||
| 4196b477da | |||
| ad39ccaa19 | |||
| 7dad2f1a17 | |||
| e36a602ba3 | |||
| 38d546330a | |||
| a1eb756c0b | |||
| 581e8eca8b | |||
| 0ae3f450f0 | |||
| e3cab403bf |
@@ -7,7 +7,7 @@ ARG APP_REVISION=N/A
|
||||
|
||||
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
ARG GGML_SYCL_F16=ON
|
||||
ARG LEVEL_ZERO_VERSION=1.28.2
|
||||
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
|
||||
RUN apt-get update && \
|
||||
@@ -24,7 +24,8 @@ COPY . .
|
||||
|
||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
|
||||
&& export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
|
||||
|
||||
@@ -37,7 +37,7 @@ LLM inference in C/C++
|
||||
|
||||
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
|
||||
|
||||
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
|
||||
- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
|
||||
- Run with Docker - see our [Docker documentation](docs/docker.md)
|
||||
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
|
||||
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
|
||||
|
||||
@@ -103,6 +103,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
||||
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
|
||||
};
|
||||
if (autoparser.tools.format.openai_wrapper_trigger) {
|
||||
// model emits the OpenAI function wrapper, trigger on it
|
||||
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -224,13 +228,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
auto single_tool_parser = p.standard_json_tools(
|
||||
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
|
||||
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
|
||||
} else {
|
||||
tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
|
||||
}
|
||||
|
||||
// Handle content wrappers if present
|
||||
|
||||
@@ -181,6 +181,7 @@ struct tool_format_analysis {
|
||||
|
||||
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
|
||||
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
|
||||
bool openai_wrapper_trigger = false; // model emits the OpenAI function wrapper, trigger on it
|
||||
|
||||
std::string function_field = "function";
|
||||
std::string name_field = "name";
|
||||
|
||||
@@ -165,6 +165,14 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
|
||||
tmpl.src.find("Do not use variables.") != std::string::npos) {
|
||||
analysis.tools.format.openai_wrapper_trigger = true;
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
|
||||
});
|
||||
|
||||
|
||||
@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||
auto arg_name_parser = literal(prop_name);
|
||||
|
||||
common_peg_parser arg_value_parser = eps();
|
||||
auto string_value_parser = choice({
|
||||
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
|
||||
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
|
||||
});
|
||||
// Quoted literal as a value: normalize_quotes_to_json preserves escapes.
|
||||
auto string_value_parser = tool_arg_value(choice({
|
||||
literal("\"") + string_content('"') + literal("\""),
|
||||
literal("'") + string_content('\'') + literal("'")
|
||||
}));
|
||||
|
||||
if (is_string_type) {
|
||||
arg_value_parser = string_value_parser;
|
||||
@@ -745,7 +746,8 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order) {
|
||||
const std::vector<std::string> & parameters_order,
|
||||
bool accept_openai_wrapper) {
|
||||
|
||||
auto tool_choices = choice();
|
||||
auto name_key_parser = literal("\"" + effective_name_key + "\"");
|
||||
@@ -807,7 +809,13 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||
return idx_a < idx_b;
|
||||
});
|
||||
|
||||
auto ordered_body = tool_open(literal("{")) + space();
|
||||
// accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
|
||||
common_peg_parser type_field = eps();
|
||||
if (accept_openai_wrapper) {
|
||||
type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
|
||||
literal("\"function\"") + space() + literal(",") + space());
|
||||
}
|
||||
auto ordered_body = tool_open(literal("{")) + space() + type_field;
|
||||
for (size_t i = 0; i < parser_pairs.size(); i++) {
|
||||
ordered_body = ordered_body + parser_pairs[i].first;
|
||||
if (i < parser_pairs.size() - 1) {
|
||||
@@ -870,7 +878,8 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
bool function_is_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order) {
|
||||
const std::vector<std::string> & parameters_order,
|
||||
bool accept_openai_wrapper) {
|
||||
if (!tools.is_array() || tools.empty()) {
|
||||
return eps();
|
||||
}
|
||||
@@ -888,7 +897,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
if (!name_spec.first.empty() || !args_spec.first.empty()) {
|
||||
tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
|
||||
} else {
|
||||
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
|
||||
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -120,7 +120,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
bool function_is_key = false,
|
||||
const std::string & call_id_key = "",
|
||||
const std::string & gen_call_id_key = "",
|
||||
const std::vector<std::string> & parameters_order = {});
|
||||
const std::vector<std::string> & parameters_order = {},
|
||||
bool accept_openai_wrapper = false);
|
||||
|
||||
// Legacy-compatible helper for building XML/tagged style tool calls
|
||||
// Used by tests and manual parsers
|
||||
@@ -157,7 +158,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order);
|
||||
const std::vector<std::string> & parameters_order,
|
||||
bool accept_openai_wrapper);
|
||||
};
|
||||
|
||||
inline common_peg_arena build_chat_peg_parser(
|
||||
|
||||
+3
-2
@@ -2678,8 +2678,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
|
||||
effective_input.substr(result.end));
|
||||
LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
|
||||
LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
|
||||
throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
|
||||
+22
-2
@@ -1507,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
|
||||
auto pieces = matcher.collect_prefix_and_next();
|
||||
|
||||
std::string pattern;
|
||||
std::string trailing; // optional proper-prefix of a delimiter, allowed only at the very end
|
||||
for (size_t i = 0; i < pieces.size(); ++i) {
|
||||
if (i > 0) {
|
||||
pattern += " | ";
|
||||
@@ -1522,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
|
||||
}
|
||||
|
||||
if (!pre.empty()) {
|
||||
pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
|
||||
std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
|
||||
pattern += pre_literal + " [^" + cls + "]";
|
||||
// Each interior alternative consumes a delimiter-prefix plus a disambiguating
|
||||
// char, so the repetition alone cannot match a value that *ends* on a proper
|
||||
// prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
|
||||
// "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
|
||||
// values, so without this the grammar would reject input the parser accepts.
|
||||
// Allow the value to terminate on any proper prefix as an optional tail.
|
||||
// This makes the grammar a slight superset of the runtime language (a value
|
||||
// may end on the longest prefix, which greedy first-match would not itself
|
||||
// produce); harmless for constrained generation, which only needs to admit
|
||||
// every runtime-valid string.
|
||||
if (!trailing.empty()) {
|
||||
trailing += " | ";
|
||||
}
|
||||
trailing += pre_literal;
|
||||
} else {
|
||||
pattern += "[^" + cls + "]";
|
||||
}
|
||||
}
|
||||
|
||||
return "(" + pattern + ")*";
|
||||
std::string result = "(" + pattern + ")*";
|
||||
if (!trailing.empty()) {
|
||||
result += " (" + trailing + ")?";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::unordered_set<std::string> collect_reachable_rules(
|
||||
|
||||
+30
-1
@@ -140,6 +140,8 @@ struct common_speculative_impl {
|
||||
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
|
||||
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
|
||||
|
||||
std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
|
||||
|
||||
// TODO: track performance of most recent calls
|
||||
const bool gen_perf = true; // whether to generate performance stats.
|
||||
|
||||
@@ -2059,6 +2061,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
|
||||
|
||||
{
|
||||
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
|
||||
|
||||
if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
|
||||
impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_accepted; ++i) {
|
||||
impl->n_acc_tokens_per_pos[i]++;
|
||||
}
|
||||
|
||||
if (n_accepted > 0) {
|
||||
impl->n_acc_drafts++;
|
||||
impl->n_acc_tokens += n_accepted;
|
||||
@@ -2093,13 +2104,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
|
||||
str_perf = "";
|
||||
}
|
||||
|
||||
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
|
||||
std::string str_stats;
|
||||
if (impl->n_call_accept > 0) {
|
||||
const double mean =
|
||||
1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
|
||||
std::ostringstream tmp;
|
||||
tmp << std::fixed << std::setprecision(3);
|
||||
for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
|
||||
if (i > 0) {
|
||||
tmp << ", ";
|
||||
}
|
||||
tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
|
||||
}
|
||||
std::ostringstream oss;
|
||||
oss << std::fixed << std::setprecision(2) << mean;
|
||||
str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
|
||||
}
|
||||
|
||||
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
|
||||
common_speculative_type_to_str(impl->type).c_str(),
|
||||
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
|
||||
impl->n_gen_drafts,
|
||||
impl->n_acc_drafts,
|
||||
impl->n_gen_tokens,
|
||||
impl->n_acc_tokens,
|
||||
str_stats.c_str(),
|
||||
str_perf.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
+12
-10
@@ -253,6 +253,7 @@ When targeting an intel GPU, the user should expect one or more devices among th
|
||||
#### Intel GPU
|
||||
|
||||
```sh
|
||||
# Uses FP32, consider using FP16 for better performance in most cases
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
@@ -262,12 +263,12 @@ or
|
||||
# Export relevant ENV variables
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
# Option 1: Use FP16 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||
|
||||
# Option 2: Use FP32
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# build all binaries
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
@@ -469,6 +470,7 @@ Choose one of following methods to build from source code.
|
||||
##### Option 1: Script
|
||||
|
||||
```sh
|
||||
# Uses FP32, consider using FP16 for better performance in most cases
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
@@ -479,11 +481,11 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
|
||||
```
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
# Option 1: Use FP16 (recommended for better performance in most cases)
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
|
||||
# Option 2: Or FP16
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
# Option 2: Use FP32
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
cmake --build build --config Release -j
|
||||
```
|
||||
@@ -491,10 +493,10 @@ cmake --build build --config Release -j
|
||||
Or, use CMake presets to build:
|
||||
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-completion
|
||||
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-completion
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
|
||||
+30
-2
@@ -1,12 +1,40 @@
|
||||
# Install pre-built version of llama.cpp
|
||||
|
||||
| Install via | Windows | Mac | Linux |
|
||||
|-------------|---------|-----|-------|
|
||||
| Install via | Windows | Mac | Linux |
|
||||
|-------------|---------|------|-------|
|
||||
| conda-forge | ✅ | ✅ | ✅ |
|
||||
| Winget | ✅ | | |
|
||||
| Homebrew | | ✅ | ✅ |
|
||||
| MacPorts | | ✅ | |
|
||||
| Nix | | ✅ | ✅ |
|
||||
|
||||
## conda-forge (Windows, Mac and Linux)
|
||||
|
||||
conda-forge provides builds for:
|
||||
- CUDA (Windows and Linux)
|
||||
- Vulkan (Windows and Linux)
|
||||
- Apple Metal (macOS)
|
||||
|
||||
```sh
|
||||
conda install -c conda-forge llama-cpp
|
||||
```
|
||||
|
||||
```sh
|
||||
mamba install -c conda-forge llama-cpp
|
||||
```
|
||||
|
||||
```sh
|
||||
# Project-local installation
|
||||
pixi add llama-cpp
|
||||
|
||||
# Global installation
|
||||
pixi global install llama-cpp
|
||||
```
|
||||
|
||||
This distribution is managed on [`conda-forge/llama.cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/).
|
||||
|
||||
Should you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues).
|
||||
|
||||
## Winget (Windows)
|
||||
|
||||
```sh
|
||||
|
||||
+4
-4
@@ -44,10 +44,10 @@ Legend:
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -89,7 +89,7 @@ Legend:
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -118,6 +118,6 @@ Legend:
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
+37
-35
@@ -27,20 +27,20 @@
|
||||
"SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
@@ -69,20 +69,20 @@
|
||||
"SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
@@ -111,8 +111,8 @@
|
||||
"SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
@@ -153,20 +153,20 @@
|
||||
"SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","SYCL"
|
||||
"SYCL0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","SYCL"
|
||||
"SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","SYCL"
|
||||
@@ -5105,6 +5105,7 @@
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=bf16,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","SYCL"
|
||||
@@ -5112,6 +5113,7 @@
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=bf16,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","1","yes","SYCL"
|
||||
@@ -9748,10 +9750,10 @@
|
||||
"SYCL0","COS","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","SYCL"
|
||||
"SYCL0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
@@ -9766,14 +9768,14 @@
|
||||
"SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","0","no","SYCL"
|
||||
"SYCL0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","LEAKY_RELU","type=f16,ne_a=[1024,1024,1,1],negative_slope=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
|
||||
"SYCL0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","SYCL"
|
||||
"SYCL0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -287,6 +287,13 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
||||
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
||||
main_stream);
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) {
|
||||
op()((const sycl::ext::oneapi::bfloat16 *) src0->data, (const sycl::ext::oneapi::bfloat16 *) src1->data,
|
||||
(sycl::ext::oneapi::bfloat16 *) dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2,
|
||||
ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0),
|
||||
ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
|
||||
#endif
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include "concat.hpp"
|
||||
|
||||
static inline size_t elem_size(ggml_type t) {
|
||||
@@ -192,11 +194,29 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
case GGML_TYPE_F32:
|
||||
concat_impl_sycl<float>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
concat_impl_sycl<sycl::half>(ctx, dst);
|
||||
break;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
concat_impl_sycl<sycl::ext::oneapi::bfloat16>(ctx, dst);
|
||||
break;
|
||||
#endif
|
||||
case GGML_TYPE_I32:
|
||||
concat_impl_sycl<int32_t>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I16:
|
||||
concat_impl_sycl<int16_t>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I64:
|
||||
concat_impl_sycl<int64_t>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I8:
|
||||
concat_impl_sycl<int8_t>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s\n", __func__, ggml_type_name(dst->type));
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
+133
-1
@@ -1022,6 +1022,120 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||
}
|
||||
}
|
||||
|
||||
// Matrix-vector product kernel for Q5_K weights stored in the reordered (SoA) layout.
//   vx    : reordered weight buffer; sections [qs][qh][scales][dm] as laid out below
//   yy    : float input vector, indexed by column
//   dst   : float output, one value per row (dst[row])
//   ncols : columns per row; the per-row block count is ncols / QK_K
//   nrows : row count, used here only to size the SoA sections
// The row handled by a work-item is its group id along dimension 2. Each work-item
// accumulates a partial sum in `tmp`; the partials are combined with XOR permutes
// across the sub-group and the work-item with local id 0 stores the result.
// NOTE(review): the section offsets assume `nrows` matches the row count used when the
// buffer was reordered - confirm against the caller.
static void dequantize_mul_mat_vec_q5_k_reorder(const void *__restrict__ vx,
|
||||
const float *__restrict__ yy,
|
||||
float *__restrict__ dst,
|
||||
const int ncols, int nrows,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int row = item_ct1.get_group(2);
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
// index of this row's first block within each SoA section
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
// SOA base pointers for the reordered layout:
|
||||
// [qs: nb * QK_K/2] [qh: nb * QK_K/8] [scales: nb * K_SCALE_SIZE] [dm: nb * sizeof(half2)]
|
||||
const int nb = nrows * num_blocks_per_row;
|
||||
const uint8_t * qs_base = (const uint8_t *)vx;
|
||||
const uint8_t * qh_base = qs_base + (size_t)nb * (QK_K / 2);
|
||||
const uint8_t * scales_base = qh_base + (size_t)nb * (QK_K / 8);
|
||||
const sycl::half2 * dm_base = (const sycl::half2 *)(scales_base + (size_t)nb * K_SCALE_SIZE);
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
// byte-wise masks applied to the packed 16-bit scale words below
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2) / 2; // 0...15
|
||||
// ix (0 or 1) selects which alternating blocks of the row this work-item visits
const int ix = item_ct1.get_local_id(2) % 2;
|
||||
|
||||
const int il = tid/4; // 0...3
|
||||
const int ir = tid - 4*il;// 0...3
|
||||
// number of consecutive elements handled per inner-loop pass
const int n = 2;
|
||||
|
||||
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const int in = il%2;
|
||||
|
||||
const int l0 = n*(2*ir + in);
|
||||
const int q_offset = 32*im + l0;
|
||||
const int y_offset = 64*im + l0;
|
||||
|
||||
// bit masks tested against the qh bytes; hm2 is hm1 moved up by four bits
const uint8_t hm1 = 1 << (2*im);
|
||||
const uint8_t hm2 = hm1 << 4;
|
||||
|
||||
// aux holds the unpacked scale words; sc views the same storage byte by byte
uint16_t aux[4];
|
||||
const uint8_t * sc = (const uint8_t *)aux;
|
||||
|
||||
// q16 holds the extracted low nibbles; q4 views the same storage byte by byte
uint16_t q16[8];
|
||||
const uint8_t * q4 = (const uint8_t *)q16;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
// global block index used to address every SoA section
const int bi = ib0 + i;
|
||||
|
||||
const uint8_t * ql1 = qs_base + bi * (QK_K / 2) + q_offset;
|
||||
const uint8_t * qh = qh_base + bi * (QK_K / 8) + l0;
|
||||
const float * y1 = yy + i*QK_K + y_offset;
|
||||
const float * y2 = y1 + 128;
|
||||
|
||||
// dm_val[0] scales the weighted sums, dm_val[1] scales the smin correction (see the tmp update)
const sycl::half2 dm_val = dm_base[bi];
|
||||
const float dall = dm_val[0];
|
||||
const float dmin = dm_val[1];
|
||||
|
||||
const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE);
|
||||
// sc[0], sc[1], sc[4], sc[5] end up multiplying the sums; sc[2], sc[3], sc[6], sc[7] multiply the y totals in smin
aux[0] = a[im+0] & kmask1;
|
||||
aux[1] = a[im+2] & kmask1;
|
||||
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
||||
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
||||
|
||||
sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
|
||||
float smin = 0;
|
||||
const uint16_t * q1 = (const uint16_t *)ql1;
|
||||
const uint16_t * q2 = q1 + 32;
|
||||
// split each 16-bit load into the low nibbles (& 0x0f0f) and the high nibbles (>> 4)
q16[0] = q1[0] & 0x0f0f;
|
||||
q16[1] = q1[8] & 0x0f0f;
|
||||
q16[2] = (q1[0] >> 4) & 0x0f0f;
|
||||
q16[3] = (q1[8] >> 4) & 0x0f0f;
|
||||
q16[4] = q2[0] & 0x0f0f;
|
||||
q16[5] = q2[8] & 0x0f0f;
|
||||
q16[6] = (q2[0] >> 4) & 0x0f0f;
|
||||
q16[7] = (q2[8] >> 4) & 0x0f0f;
|
||||
// each product adds 16 to the 4-bit value when the matching qh bit is set
for (int l = 0; l < n; ++l) {
|
||||
sum.x() +=
|
||||
y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
|
||||
y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
|
||||
sum.y() +=
|
||||
y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
|
||||
y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
|
||||
sum.z() +=
|
||||
y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
|
||||
y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
|
||||
sum.w() +=
|
||||
y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
|
||||
y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
|
||||
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
||||
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
||||
}
|
||||
tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
|
||||
sum.w() * sc[5]) -
|
||||
dmin * smin;
|
||||
}
|
||||
#else
|
||||
// The reordered Q5_K layout is only produced for QK_K == 256.
// In this configuration nothing is accumulated, so tmp stays 0 and dst[row] receives 0.
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
|
||||
// only the work-item with local id 0 stores the row result
if (item_ct1.get_local_id(2) == 0) {
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
@@ -1599,6 +1713,19 @@ static void dequantize_mul_mat_vec_q4_K_sycl_reorder(const void *vx, const float
|
||||
});
|
||||
}
|
||||
|
||||
// Host-side launcher for dequantize_mul_mat_vec_q5_k_reorder.
// Requires ncols to be a multiple of QK_K (asserted). Submits `nrows` work-groups of
// QK_WARP_SIZE work-items along dimension 2, with the sub-group size pinned to
// QK_WARP_SIZE; the kernel takes its row index from the group id.
static void dequantize_mul_mat_vec_q5_K_sycl_reorder(const void *vx, const float *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q5_k_reorder(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_K_sycl_reorder(const void *vx, const float *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
@@ -1695,7 +1822,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
dequantize_mul_mat_vec_q5_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
} else {
|
||||
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
|
||||
@@ -124,6 +124,11 @@ static __dpct_inline__ T op_exp(T x) {
|
||||
return sycl::exp(x);
|
||||
}
|
||||
|
||||
// Scalar helper: returns sycl::expm1(x), i.e. exp(x) - 1, in the same type T as the input.
template<typename T>
|
||||
static __dpct_inline__ T op_expm1(T x) {
|
||||
return sycl::expm1(x);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __dpct_inline__ T op_log(T x) {
|
||||
if (x <= static_cast<T>(0)) {
|
||||
@@ -266,13 +271,6 @@ static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl:
|
||||
}
|
||||
}
|
||||
|
||||
// Element-wise floor kernel body: stores op_floor(x[i]) into dst[i] for every index i
// produced by SYCL_GLOBAL_ID_LOOP(k, item_ct1).
// NOTE(review): the macro is defined elsewhere; presumably it walks i over [0, k) - confirm.
template<typename T>
|
||||
static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
dst[i] = op_floor(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
@@ -280,20 +278,6 @@ static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::
|
||||
}
|
||||
}
|
||||
|
||||
// Element-wise round kernel body: stores op_round(x[i]) into dst[i] for every index i
// produced by SYCL_GLOBAL_ID_LOOP(k, item_ct1).
// NOTE(review): the macro is defined elsewhere; presumably it walks i over [0, k) - confirm.
template<typename T>
|
||||
static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
dst[i] = op_round(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Element-wise trunc kernel body: stores op_trunc(x[i]) into dst[i] for every index i
// produced by SYCL_GLOBAL_ID_LOOP(k, item_ct1).
// NOTE(review): the macro is defined elsewhere; presumably it walks i over [0, k) - confirm.
template<typename T>
|
||||
static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
dst[i] = op_trunc(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
|
||||
const sycl::nd_item<1> &item_ct1) {
|
||||
@@ -605,6 +589,12 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
});
|
||||
}
|
||||
|
||||
// EXPM1 unary op: hands ggml_sycl_detail::ggml_sycl_op_unary a generic lambda that
// maps each value x to op_expm1(x). Element type dispatch is left to that helper.
static inline void ggml_sycl_op_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_expm1(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
@@ -728,16 +718,9 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1);
|
||||
});
|
||||
});
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_floor(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
@@ -747,29 +730,15 @@ static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
|
||||
});
|
||||
});
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_round(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1);
|
||||
});
|
||||
});
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_trunc(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
@@ -1018,6 +987,11 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_exp(ctx, dst);
|
||||
}
|
||||
|
||||
// Public entry point for the EXPM1 unary op: creates the scoped debug printer
// (function name, dst, one source tensor) and forwards to ggml_sycl_op_expm1.
void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_expm1(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_log(ctx, dst);
|
||||
|
||||
@@ -59,6 +59,8 @@ void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -3685,6 +3685,149 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reorder each expert slice into a self-contained SoA layout.
// Q4_K variant. data_device holds n_expert consecutive slices of expert_bytes each;
// every slice is rewritten in place as [qs][scales][dm], reading from a temporary copy
// of the whole buffer. expert_bytes must be a multiple of sizeof(block_q4_K) (asserted).
// Returns false (after a warning) if the temporary buffer cannot be allocated, in which
// case data_device is left untouched; returns true otherwise.
static bool reorder_qw_q4_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(expert_bytes % sizeof(block_q4_K) == 0);
|
||||
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q4_K));
|
||||
const size_t total_bytes = expert_bytes * (size_t) n_expert;
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, total_bytes);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
// snapshot the original AoS data so the kernel can overwrite data_device in place
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
// one work-item per (expert, block) pair
const int total_blocks = blocks_per_expert * (int) n_expert;
|
||||
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
|
||||
const int gb = gb_;
|
||||
// e: expert index, ib: block index inside that expert
const int e = gb / blocks_per_expert;
|
||||
const int ib = gb % blocks_per_expert;
|
||||
const block_q4_K * x = (const block_q4_K *) (tmp_buf + (size_t) e * expert_bytes);
|
||||
uint8_t * base = data_device + (size_t) e * expert_bytes;
|
||||
|
||||
// section starts inside this expert's slice: qs, then scales, then dm
auto * qs_ptr = base;
|
||||
auto * scales_ptr = qs_ptr + QK_K / 2 * blocks_per_expert;
|
||||
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert);
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
||||
}
|
||||
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
||||
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
||||
}
|
||||
dm_ptr[ib] = x[ib].dm;
|
||||
});
|
||||
// NOTE(review): in async mode neither event is waited on here - presumably the queue
// order and sycl_reorder_temp_buffer keep tmp_buf valid until the kernel finishes; confirm.
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reorder each Q5_K expert slice into [qs][qh][scales][dm].
// data_device holds n_expert consecutive slices of expert_bytes each; every slice is
// rewritten in place, reading from a temporary copy of the whole buffer. expert_bytes
// must be a multiple of sizeof(block_q5_K) (asserted).
// Returns false (after a warning) if the temporary buffer cannot be allocated, in which
// case data_device is left untouched; returns true otherwise.
static bool reorder_qw_q5_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(expert_bytes % sizeof(block_q5_K) == 0);
|
||||
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q5_K));
|
||||
const size_t total_bytes = expert_bytes * (size_t) n_expert;
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, total_bytes);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
// snapshot the original AoS data so the kernel can overwrite data_device in place
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
// one work-item per (expert, block) pair
const int total_blocks = blocks_per_expert * (int) n_expert;
|
||||
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
|
||||
const int gb = gb_;
|
||||
// e: expert index, ib: block index inside that expert
const int e = gb / blocks_per_expert;
|
||||
const int ib = gb % blocks_per_expert;
|
||||
const block_q5_K * x = (const block_q5_K *) (tmp_buf + (size_t) e * expert_bytes);
|
||||
uint8_t * base = data_device + (size_t) e * expert_bytes;
|
||||
|
||||
// section starts inside this expert's slice: qs, qh, scales, then dm
auto * qs_ptr = base;
|
||||
auto * qh_ptr = qs_ptr + (QK_K / 2) * blocks_per_expert;
|
||||
auto * scales_ptr = qh_ptr + (QK_K / 8) * blocks_per_expert;
|
||||
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert);
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
||||
}
|
||||
for (int j = 0; j < QK_K / 8; ++j) {
|
||||
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
|
||||
}
|
||||
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
||||
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
||||
}
|
||||
dm_ptr[ib] = x[ib].dm;
|
||||
});
|
||||
// NOTE(review): in async mode neither event is waited on here - presumably the queue
// order and sycl_reorder_temp_buffer keep tmp_buf valid until the kernel finishes; confirm.
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reorder each Q6_K expert slice into [ql][qh][scales][d].
// data_device holds n_expert consecutive slices of expert_bytes each; every slice is
// rewritten in place, reading from a temporary copy of the whole buffer. expert_bytes
// must be a multiple of sizeof(block_q6_K) (asserted).
// Returns false (after a warning) if the temporary buffer cannot be allocated, in which
// case data_device is left untouched; returns true otherwise.
static bool reorder_qw_q6_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(expert_bytes % sizeof(block_q6_K) == 0);
|
||||
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q6_K));
|
||||
const size_t total_bytes = expert_bytes * (size_t) n_expert;
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, total_bytes);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
// snapshot the original AoS data so the kernel can overwrite data_device in place
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
// one work-item per (expert, block) pair
const int total_blocks = blocks_per_expert * (int) n_expert;
|
||||
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
|
||||
const int gb = gb_;
|
||||
// e: expert index, ib: block index inside that expert
const int e = gb / blocks_per_expert;
|
||||
const int ib = gb % blocks_per_expert;
|
||||
const block_q6_K * x = (const block_q6_K *) (tmp_buf + (size_t) e * expert_bytes);
|
||||
uint8_t * base = data_device + (size_t) e * expert_bytes;
|
||||
|
||||
// section starts inside this expert's slice: ql, qh, scales, then d
auto * ql_ptr = base;
|
||||
auto * qh_ptr = ql_ptr + (QK_K / 2) * blocks_per_expert;
|
||||
auto * scales_ptr = qh_ptr + (QK_K / 4) * blocks_per_expert;
|
||||
auto * d_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * blocks_per_expert);
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
ql_ptr[ib * (QK_K / 2) + j] = x[ib].ql[j];
|
||||
}
|
||||
for (int j = 0; j < QK_K / 4; ++j) {
|
||||
qh_ptr[ib * (QK_K / 4) + j] = x[ib].qh[j];
|
||||
}
|
||||
for (int j = 0; j < QK_K / 16; ++j) {
|
||||
scales_ptr[ib * (QK_K / 16) + j] = x[ib].scales[j];
|
||||
}
|
||||
d_ptr[ib] = x[ib].d;
|
||||
});
|
||||
// NOTE(review): in async mode neither event is waited on here - presumably the queue
// order and sycl_reorder_temp_buffer keep tmp_buf valid until the kernel finishes; confirm.
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
|
||||
@@ -3840,6 +3983,22 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
size_t nrows = src0->ne[1];
|
||||
size_t size = ggml_nbytes(src0);
|
||||
|
||||
// MoE expert weights are addressed per expert via nb[2], so each slice must
|
||||
// remain self-contained after reorder.
|
||||
if (src0->ne[2] > 1) {
|
||||
GGML_ASSERT((size_t) size == (size_t) src0->ne[2] * src0->nb[2]);
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_K:
|
||||
return reorder_qw_q4_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
|
||||
case GGML_TYPE_Q5_K:
|
||||
return reorder_qw_q5_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
|
||||
case GGML_TYPE_Q6_K:
|
||||
return reorder_qw_q6_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
||||
@@ -3854,7 +4013,6 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
case GGML_TYPE_Q6_K:
|
||||
return reorder_qw_q6_k(data_device, size, 0, stream);
|
||||
default:
|
||||
GGML_ABORT("reorder_qw() called with unsupported type");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -3902,6 +4060,23 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
|
||||
}
|
||||
}
|
||||
|
||||
// Lazily reorder supported MoE expert weights once their fused path is used.
// Does nothing when optimization is globally disabled, when the context has reorder
// turned off, when src0 is not Q4_K/Q5_K/Q6_K, when src0 has no GPU extra, or when the
// tensor is already marked as reordered. The reorder flag is set only if reorder_qw()
// reports success, so a failed reorder leaves the tensor on the non-reordered path.
static void opt_for_reorder_id(ggml_backend_sycl_context * ctx, const ggml_tensor * src0) {
|
||||
if (g_ggml_sycl_disable_optimize || !ctx->opt_feature.reorder) {
|
||||
return;
|
||||
}
|
||||
if (src0->type != GGML_TYPE_Q4_K && src0->type != GGML_TYPE_Q5_K && src0->type != GGML_TYPE_Q6_K) {
|
||||
return;
|
||||
}
|
||||
ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
|
||||
if (!extra || extra->optimized_feature.reorder) {
|
||||
return;
|
||||
}
|
||||
if (reorder_qw(src0, ctx->stream())) {
|
||||
extra->optimized_feature.reorder = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
|
||||
@@ -4067,11 +4242,6 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
||||
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
|
||||
if (!ggml_is_contiguous(src1)) return false;
|
||||
|
||||
// Reorder layout not supported; fall back.
|
||||
const ggml_tensor_extra_gpu * src0_extra =
|
||||
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
||||
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
|
||||
|
||||
const int64_t n_ids_per_group = ids->ne[0];
|
||||
if (ids->ne[1] != 1) return false;
|
||||
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
|
||||
@@ -4081,16 +4251,37 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
||||
const int n_experts_used = (int) n_ids_per_group;
|
||||
const int nrows = (int) src0->ne[1];
|
||||
|
||||
// Lazily reorder the (Q4_K/Q5_K/Q6_K) expert weights into a per-expert SoA layout, then run the reorder
|
||||
// GEMV. Placed after the bail checks so a non-dispatchable op does not pay the reorder cost.
|
||||
opt_for_reorder_id(&ctx, src0);
|
||||
const ggml_tensor_extra_gpu * src0_extra =
|
||||
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
||||
const bool use_reorder = src0_extra && src0_extra->optimized_feature.reorder;
|
||||
|
||||
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
|
||||
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
|
||||
char * src1_ddq = src1_q8_alloc.get();
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
if (use_reorder) {
|
||||
quantize_row_q8_1_sycl<quantize_and_reorder_q8_1_soa>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
} else {
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
}
|
||||
|
||||
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
|
||||
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
|
||||
|
||||
if (use_reorder) {
|
||||
return ggml_sycl_mul_mat_vec_q_id_reorder(
|
||||
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
||||
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
||||
/*expert_weight_stride=*/ src0->nb[2],
|
||||
/*dst_row_stride=*/ dst->nb[1],
|
||||
src1_row_stride, stream);
|
||||
}
|
||||
return ggml_sycl_mul_mat_vec_q_id(
|
||||
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
||||
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
||||
@@ -4489,6 +4680,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_EXP:
|
||||
ggml_sycl_exp(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
ggml_sycl_expm1(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
ggml_sycl_softplus(ctx, dst);
|
||||
break;
|
||||
@@ -5138,6 +5332,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
@@ -5145,11 +5340,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
#if defined (GGML_SYCL_F16)
|
||||
return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
|
||||
#else
|
||||
return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
|
||||
#endif
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2468,3 +2468,118 @@ bool ggml_sycl_mul_mat_vec_q_id(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reorder (SoA) MoE expert GEMV: MoE expert/row/lane indexing (from mul_mat_vec_q_moe) with the
|
||||
// dense-reorder per-block reads (from mul_mat_vec_q_reorder). Each expert slice in vx_base is a
|
||||
// self-contained SoA, so nblocks = nrows*(ncols/qk) per expert and the constant expert stride holds.
// Parameters (all strides are in bytes):
//   vx_base              : base of the expert weights; expert e starts at e * expert_weight_stride
//   vy_base              : quantized src1; slot s starts at s * src1_row_stride
//   dst_base             : float output; slot s starts at s * dst_row_stride
//   ids_dev              : expert id for each slot, indexed by the group id along dimension 1
//   ncols, nrows         : columns and rows of one expert matrix
// Each sub-group computes one output row and its leader stores dst[row].
|
||||
template <typename reorder_vec_dot_q_sycl>
|
||||
static void mul_mat_vec_q_moe_reorder(
|
||||
const void * __restrict__ vx_base, const void * __restrict__ vy_base,
|
||||
float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
|
||||
const int ncols, const int nrows,
|
||||
const size_t expert_weight_stride, const size_t dst_row_stride,
|
||||
const size_t src1_row_stride,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
using block_type = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
|
||||
using block_traits = typename block_type::traits;
|
||||
|
||||
// slot index within the launch, and the expert id selected for that slot
const int expert_idx = item_ct1.get_group(1);
|
||||
const int i02 = ids_dev[expert_idx];
|
||||
|
||||
const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
|
||||
const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
|
||||
float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
|
||||
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
|
||||
// the launch grid is rounded up, so trailing rows have no work
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto sg = item_ct1.get_sub_group();
|
||||
|
||||
const int blocks_per_row = ncols / block_traits::qk;
|
||||
constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
|
||||
constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
|
||||
// block count of one expert slice, passed to the layout helper to locate SoA sections
const int nblocks = nrows * (ncols / block_traits::qk);
|
||||
|
||||
static_assert(blocks_per_subgroup > 0);
|
||||
static_assert(block_elements_per_subgroup > 0);
|
||||
|
||||
float partial_sum = 0.0f;
|
||||
for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
|
||||
const int ibx = row * blocks_per_row + i;
|
||||
|
||||
const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
|
||||
const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
|
||||
|
||||
const int iby = i * block_type::block_to_q8_1_ratio();
|
||||
const int8_t * q8_1_quant_ptr = (const int8_t *) vy + iby * QK8_1;
|
||||
// the half2 values of the src1 row are read starting ncols bytes after its quants
const sycl::half2 * q8_1_ds_ptr = (const sycl::half2 *) ((const char *) vy + ncols + iby * sizeof(sycl::half2));
|
||||
|
||||
#pragma unroll
|
||||
for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
|
||||
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
|
||||
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
|
||||
}
|
||||
}
|
||||
|
||||
// combine the per-lane partial sums; only the sub-group leader writes the row
auto sum = sycl::reduce_over_group(sg, partial_sum, std::plus<>());
|
||||
if (sg.leader()) {
|
||||
dst[row] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Submits mul_mat_vec_q_moe_reorder for one quantization type.
// Grid: (1, n_experts_used, ceil(nrows / GGML_SYCL_MMV_Y)) work-groups of
// (1, GGML_SYCL_MMV_Y, WARP_SIZE) work-items, with the sub-group size pinned to WARP_SIZE.
// All arguments are forwarded to the kernel unchanged.
template <typename reorder_vec_dot_q_sycl>
|
||||
static void launch_mul_mat_vec_q_moe_reorder(
|
||||
const void * vx_base, const void * vy, const int32_t * ids_dev,
|
||||
float * dst_base, const int ncols, const int nrows, const int n_experts_used,
|
||||
const size_t expert_weight_stride, const size_t dst_row_stride,
|
||||
const size_t src1_row_stride,
|
||||
dpct::queue_ptr stream) {
|
||||
// round up so every row is covered when nrows is not a multiple of GGML_SYCL_MMV_Y
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl>(
|
||||
vx_base, vy, dst_base, ids_dev, ncols, nrows,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Type dispatcher for the reordered (SoA) fused MoE GEMV.
// Launches the kernel for Q4_K, Q5_K or Q6_K weights and returns true; returns false
// without launching anything for every other src0_type, so the caller can fall back.
bool ggml_sycl_mul_mat_vec_q_id_reorder(
|
||||
enum ggml_type src0_type,
|
||||
const void * vx_base,
|
||||
const void * vy,
|
||||
const int32_t * ids_dev,
|
||||
float * dst_base,
|
||||
int ncols,
|
||||
int nrows,
|
||||
int n_experts_used,
|
||||
size_t expert_weight_stride,
|
||||
size_t dst_row_stride,
|
||||
size_t src1_row_stride,
|
||||
dpct::queue_ptr stream) {
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_Q4_K:
|
||||
launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q5_K:
|
||||
launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q6_K:
|
||||
launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
default:
|
||||
// unsupported type: nothing was submitted
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,4 +40,21 @@ bool ggml_sycl_mul_mat_vec_q_id(
|
||||
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
|
||||
dpct::queue_ptr stream);
|
||||
|
||||
// Reorder (SoA) variant of the fused MoE expert GEMV.
|
||||
// vx_base: each expert slice (stride expert_weight_stride == src0->nb[2]) is a self-contained reorder/SoA layout.
|
||||
// vy: src1 quantized with quantize_and_reorder_q8_1_soa (per-row SoA). Returns false if src0_type isn't handled.
|
||||
bool ggml_sycl_mul_mat_vec_q_id_reorder(
|
||||
enum ggml_type src0_type,
|
||||
const void * vx_base,
|
||||
const void * vy,
|
||||
const int32_t * ids_dev,
|
||||
float * dst_base,
|
||||
int ncols,
|
||||
int nrows,
|
||||
int n_experts_used,
|
||||
size_t expert_weight_stride,
|
||||
size_t dst_row_stride,
|
||||
size_t src1_row_stride,
|
||||
dpct::queue_ptr stream);
|
||||
|
||||
#endif // GGML_SYCL_MMVQ_HPP
|
||||
|
||||
@@ -902,14 +902,17 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_col2im_1d_f32;
|
||||
vk_pipeline pipeline_col2im_1d_f16;
|
||||
vk_pipeline pipeline_col2im_1d_bf16;
|
||||
vk_pipeline pipeline_snake_f32;
|
||||
vk_pipeline pipeline_snake_f16;
|
||||
vk_pipeline pipeline_snake_bf16;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
// [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
|
||||
vk_pipeline pipeline_gated_delta_net[3][2];
|
||||
// [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128
|
||||
vk_pipeline pipeline_gated_delta_net[4][2];
|
||||
vk_pipeline pipeline_ssm_scan_f32_d128;
|
||||
vk_pipeline pipeline_ssm_scan_f32_d256;
|
||||
vk_pipeline pipeline_ssm_conv_f32;
|
||||
@@ -1552,6 +1555,16 @@ struct vk_op_timestep_embedding_push_constants {
|
||||
uint32_t max_period;
|
||||
};
|
||||
|
||||
// Push constants for the col2im_1d compute shader.
// Field order and types must stay in sync with the `parameter` push-constant
// block declared in col2im_1d.comp.
struct vk_op_col2im_1d_push_constants {
    uint32_t T_out;   // output length (dst->ne[0])
    uint32_t OC;      // number of output channels (op_params[1])
    uint32_t K_OC;    // rows per input column (src0->ne[0])
    uint32_t T_in;    // number of input columns (src0->ne[1])
    uint32_t K;       // kernel size, computed by the host as K_OC / OC
    int32_t  stride;  // op_params[0]
    int32_t  p0;      // op_params[2]; added to the output index in the shader
};
|
||||
|
||||
struct vk_op_conv_transpose_1d_push_constants {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
@@ -3067,8 +3080,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
} else if (device->uma) {
|
||||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
|
||||
// On UMA, prefer host-visible memory so direct tensor borrowing works.
|
||||
// If unavailable, fall back to device-local memory.
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
|
||||
} else if (device->disable_host_visible_vidmem) {
|
||||
if (device->allow_sysmem_fallback) {
|
||||
@@ -5203,6 +5218,9 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32, "col2im_1d_f32", col2im_1d_f32_len, col2im_1d_f32_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16, "col2im_1d_f16", col2im_1d_f16_len, col2im_1d_f16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
@@ -5215,14 +5233,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
|
||||
{
|
||||
const uint32_t gdn_sizes[] = {32, 64, 128};
|
||||
const uint32_t gdn_sizes[] = {16, 32, 64, 128};
|
||||
const char * gdn_names[][2] = {
|
||||
{"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"},
|
||||
{"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"},
|
||||
{"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"},
|
||||
{"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"},
|
||||
};
|
||||
const bool use_subgroup_reduce = device->subgroup_arithmetic;
|
||||
for (uint32_t si = 0; si < 3; si++) {
|
||||
for (uint32_t si = 0; si < 4; si++) {
|
||||
const uint32_t S_V = gdn_sizes[si];
|
||||
GGML_ASSERT(is_pow2(S_V));
|
||||
|
||||
@@ -5236,10 +5254,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
lanes_per_column = std::min(S_V, device->subgroup_size);
|
||||
}
|
||||
|
||||
const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
|
||||
// gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and
|
||||
// S_V % LANES_PER_COLUMN == 0 to avoid bounds checks.
|
||||
while (lanes_per_column > 1u) {
|
||||
const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 &&
|
||||
(S_V % lanes_per_column) == 0;
|
||||
const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0;
|
||||
if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) {
|
||||
break;
|
||||
}
|
||||
lanes_per_column >>= 1u;
|
||||
}
|
||||
|
||||
GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0);
|
||||
GGML_ASSERT((S_V % lanes_per_column) == 0);
|
||||
GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0);
|
||||
|
||||
const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size;
|
||||
const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce;
|
||||
const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce;
|
||||
const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce;
|
||||
size_t gdn_len;
|
||||
const void * gdn_data;
|
||||
if (use_subgroup_reduce && need_clustered_shader) {
|
||||
if (use_clustered_reduce) {
|
||||
gdn_len = gated_delta_net_f32_len;
|
||||
gdn_data = (const void *)gated_delta_net_f32_data;
|
||||
} else if (use_subgroup_reduce) {
|
||||
@@ -5256,7 +5293,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
for (uint32_t kda = 0; kda < 2; kda++) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda],
|
||||
gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
|
||||
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
|
||||
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10702,6 +10739,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_conv_transpose_1d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32;
|
||||
case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16;
|
||||
case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16;
|
||||
default: return nullptr;
|
||||
}
|
||||
case GGML_OP_POOL_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
@@ -10723,9 +10767,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
|
||||
uint32_t si;
|
||||
switch (S_v) {
|
||||
case 32: si = 0; break;
|
||||
case 64: si = 1; break;
|
||||
case 128: si = 2; break;
|
||||
case 16: si = 0; break;
|
||||
case 32: si = 1; break;
|
||||
case 64: si = 2; break;
|
||||
case 128: si = 3; break;
|
||||
default: return nullptr;
|
||||
}
|
||||
return ctx->device->pipeline_gated_delta_net[si][kda];
|
||||
@@ -11147,6 +11192,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
{
|
||||
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
||||
} break;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
{
|
||||
elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 };
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
const uint32_t N = dst->ne[3];
|
||||
@@ -12936,6 +12985,32 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
|
||||
}
|
||||
|
||||
// Records a GGML_OP_COL2IM_1D dispatch.
//
//   src0: [K_OC, T_in] columns from matmul
//   dst:  [T_out, OC]
//
// op_params of dst: [0] = stride, [1] = number of output channels, [2] = p0.
// The kernel size K is derived as K_OC / OC. The filled push-constant block is
// forwarded to ggml_vk_op_f32 with src0 as the only source tensor.
static void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    const int32_t stride = dst->op_params[0];
    const int32_t oc     = dst->op_params[1];
    const int32_t p0     = dst->op_params[2];

    // Both values are used as divisors: `oc` below on the host, `stride` in the
    // shader. Fail loudly instead of dividing by zero.
    GGML_ASSERT(oc > 0);
    GGML_ASSERT(stride > 0);

    const uint32_t K_OC  = static_cast<uint32_t>(src0->ne[0]);
    const uint32_t T_in  = static_cast<uint32_t>(src0->ne[1]);
    const uint32_t T_out = static_cast<uint32_t>(dst->ne[0]);
    const uint32_t OC    = static_cast<uint32_t>(oc);
    const uint32_t K     = K_OC / OC;

    vk_op_col2im_1d_push_constants p{};
    p.T_out  = T_out;
    p.OC     = OC;
    p.K_OC   = K_OC;
    p.T_in   = T_in;
    p.K      = K;
    p.stride = stride;
    p.p0     = p0;

    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p));
}
|
||||
|
||||
// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
|
||||
// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
|
||||
// dedicated kernel directly. The pattern is validated by
|
||||
@@ -14423,6 +14498,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
ggml_vk_col2im_1d(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
|
||||
@@ -17136,7 +17215,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
{
|
||||
const uint32_t S_v = op->src[2]->ne[0];
|
||||
if (S_v != 32 && S_v != 64 && S_v != 128) {
|
||||
if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < 6; i++) {
|
||||
@@ -17188,6 +17267,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 ||
|
||||
op->src[0]->type == GGML_TYPE_F16 ||
|
||||
op->src[0]->type == GGML_TYPE_BF16) &&
|
||||
op->type == op->src[0]->type &&
|
||||
ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op);
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
@@ -18019,6 +18105,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
const int32_t p0 = tensor->op_params[1];
|
||||
const int32_t d0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
||||
} else if (tensor->op == GGML_OP_COL2IM_1D) {
|
||||
const int32_t stride = tensor->op_params[0];
|
||||
const int32_t oc = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // columns: [K_OC, T_in]
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; // output: [T_out, OC]
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t T_out;
|
||||
uint32_t OC;
|
||||
uint32_t K_OC;
|
||||
uint32_t T_in;
|
||||
uint32_t K;
|
||||
int32_t stride;
|
||||
int32_t p0;
|
||||
} p;
|
||||
|
||||
// Reads element `idx` of the input buffer and widens it to float.
// BF16 inputs are stored as raw 16-bit words and decoded with bf16_to_fp32;
// every other A_TYPE is converted with a plain float() constructor.
float load_col(uint32_t idx) {
#ifdef DATA_A_BF16
    const uint32_t raw_bits = uint32_t(data_a[idx]);
    return bf16_to_fp32(raw_bits);
#else
    return float(data_a[idx]);
#endif
}
|
||||
|
||||
// Narrows `v` to D_TYPE and writes it to element `idx` of the output buffer.
// For BF16 the value is first encoded with fp32_to_bf16; every other D_TYPE is
// produced with a plain D_TYPE() constructor.
void store_dst(uint32_t idx, float v) {
#ifdef DATA_A_BF16
    data_d[idx] = D_TYPE(fp32_to_bf16(v));
#else
    data_d[idx] = D_TYPE(v);
#endif
}
|
||||
|
||||
// One invocation per output element: x indexes the output position t_out,
// y indexes the output channel oc. Each invocation gathers the input columns
// that contribute to its output position and sums them.
void main() {
    const uint32_t t_out = gl_GlobalInvocationID.x;
    const uint32_t oc    = gl_GlobalInvocationID.y;
    if (t_out >= p.T_out || oc >= p.OC) {
        return;
    }

    // Absolute position in the uncropped signal.
    const int32_t t_abs = int32_t(t_out) + p.p0;

    // Gather: visit only the ceil(K/stride) columns that scatter into t_abs,
    // clamped to the valid column range [0, T_in - 1]. No modulo is needed.
    const int32_t first_col = max((t_abs - int32_t(p.K) + p.stride) / p.stride, int32_t(0));
    const int32_t last_col  = min(t_abs / p.stride, int32_t(p.T_in) - 1);

    // Column layout is [K_OC, T_in]; the row inside a column is oc * K + k.
    const uint32_t row_base = oc * p.K;

    float acc = 0.0;
    for (int32_t col = first_col; col <= last_col; col++) {
        const int32_t k = t_abs - col * p.stride;
        acc += load_col(row_base + uint32_t(k) + uint32_t(col) * p.K_OC);
    }

    // dst layout: [T_out, OC], element (t_out, oc) = t_out + oc * T_out
    store_dst(t_out + oc * p.T_out, acc);
}
|
||||
@@ -1003,6 +1003,9 @@ void process_shaders() {
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("col2im_1d_f32", "col2im_1d.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("col2im_1d_f16", "col2im_1d.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("col2im_1d_bf16", "col2im_1d.comp", {{"DATA_A_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
|
||||
|
||||
string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
|
||||
@@ -129,7 +129,7 @@ void test_gbnf_generation(testing &t) {
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
|
||||
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* ("<" | "</" | "</t" | "</ta" | "</tag")?
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
+75
-2
@@ -1882,11 +1882,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de
|
||||
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
|
||||
.run();
|
||||
|
||||
// Python tool with multiline code in string
|
||||
// Python tool with multiline code in string: the \n in the literal decodes to a real
|
||||
// newline, emitted as a JSON \n escape (not a doubled backslash).
|
||||
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
|
||||
{ "python", R"#({"code": "def hello():\n print('hey')"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
// String escape sequences decode to their actual characters (newline + tab here),
|
||||
// so a "write a two line file" style call produces real line breaks, not literal "\n".
|
||||
tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "First line\nSecond line\tindented"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
// Escaped quotes inside a string argument survive the round-trip.
|
||||
tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "print(\"hi\")"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
@@ -2024,6 +2042,61 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=edit>\n"
|
||||
"<parameter=filename>\n"
|
||||
"foo.c\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=oldString>\n"
|
||||
"#iclunde\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=newString>\n"
|
||||
"#include\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
edit_tool
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// a parameter value that itself ends in a newline (e.g. a source file with a
|
||||
// trailing newline). The structural delimiter is "\n</parameter>\n", so the value
|
||||
// "#include\n" renders as "...#include\n\n</parameter>\n". The trailing newline must
|
||||
// be preserved faithfully (no stripping), and the generated grammar must admit a
|
||||
// value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern.
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=edit>\n"
|
||||
"<parameter=filename>\n"
|
||||
"foo.c\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=oldString>\n"
|
||||
"#iclunde\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=newString>\n"
|
||||
"#include\n"
|
||||
"\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
edit_tool
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
|
||||
// test code that starts with indent
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
|
||||
@@ -323,6 +323,7 @@ struct cmd_params {
|
||||
std::vector<std::string> hf_repo;
|
||||
std::vector<std::string> hf_file;
|
||||
std::string hf_token;
|
||||
bool offline;
|
||||
std::vector<int> n_prompt;
|
||||
std::vector<int> n_gen;
|
||||
std::vector<std::pair<int, int>> n_pg;
|
||||
@@ -367,6 +368,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* hf_repo */ {},
|
||||
/* hf_file */ {},
|
||||
/* hf_token */ "",
|
||||
/* offline */ false,
|
||||
/* n_prompt */ { 512 },
|
||||
/* n_gen */ { 128 },
|
||||
/* n_pg */ {},
|
||||
@@ -437,6 +439,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" (default: unused)\n");
|
||||
printf(" -hft, --hf-token <token> Hugging Face access token\n");
|
||||
printf(" (default: value from HF_TOKEN environment variable)\n");
|
||||
printf(" --offline Offline mode: forces use of cache, prevents network access\n");
|
||||
printf(" (default: disabled)\n");
|
||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||
@@ -558,6 +562,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.hf_token = argv[i];
|
||||
} else if (arg == "--offline") {
|
||||
params.offline = true;
|
||||
} else if (arg == "-p" || arg == "--n-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -1040,6 +1046,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
auto download_result = common_download_model(model, opts);
|
||||
if (download_result.model_path.empty()) {
|
||||
fprintf(stderr, "error: failed to download model from HuggingFace\n");
|
||||
|
||||
@@ -247,7 +247,9 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
llama_pos * new_n_past,
|
||||
mtmd_helper_post_decode_callback callback,
|
||||
void * user_data) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
@@ -302,10 +304,23 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
int32_t ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
if (use_non_causal) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (callback != nullptr) {
|
||||
ret = callback(batch_embd_view, user_data);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("post-decode callback failed\n");
|
||||
if (use_non_causal) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
|
||||
i_batch++;
|
||||
@@ -379,7 +394,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
|
||||
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past, nullptr, nullptr);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_batch_free(text_batch);
|
||||
|
||||
@@ -91,6 +91,8 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
typedef int32_t (*mtmd_helper_post_decode_callback)(struct llama_batch batch, void * user_data);
|
||||
|
||||
// helper function to decode an image whose embeddings have already been calculated
|
||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure; if the optional post-decode callback returns non-zero, that value is returned instead
|
||||
@@ -101,7 +103,9 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past);
|
||||
llama_pos * new_n_past,
|
||||
mtmd_helper_post_decode_callback callback,
|
||||
void * user_data);
|
||||
|
||||
//
|
||||
// video input helpers (requires ffmpeg/ffprobe installed on the system)
|
||||
|
||||
+8
-9
@@ -96,16 +96,15 @@ struct mtmd_image_tokens {
|
||||
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
|
||||
return (nx + 1) * ny + 2;
|
||||
}
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
|
||||
return nx * ny;
|
||||
}
|
||||
uint32_t nz = batch_f32.entries.size();
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
if (n_temporal_merge > 1) {
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
}
|
||||
}
|
||||
return nx * ny * nz;
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
required=True)
|
||||
parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
|
||||
parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
|
||||
parser.add_argument("--offline", action="store_true", default=False, help="Offline mode: forces use of cache, prevents network access")
|
||||
parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
|
||||
parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
|
||||
parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
|
||||
@@ -268,6 +269,8 @@ def start_server_background(args):
|
||||
]
|
||||
server_args.extend(['--hf-repo', args.hf_repo])
|
||||
server_args.extend(['--hf-file', args.hf_file])
|
||||
if args.offline:
|
||||
server_args.append('--offline')
|
||||
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
|
||||
server_args.extend(['--ctx-size', args.ctx_size])
|
||||
server_args.extend(['--parallel', args.parallel])
|
||||
|
||||
@@ -539,37 +539,6 @@ bool server_tokens::validate(const struct llama_context * ctx) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Encodes and decodes the multimodal chunk found at token index `idx`.
//
// The chunk is evaluated through mtmd_helper_eval_chunk_single at position
// `pos` for sequence `seq_id`, using the context's batch size and requesting
// logits for the last token.
//
// Returns 0 on success and sets n_tokens_out to the chunk's token count.
// On failure returns the non-zero status from the helper and sets
// n_tokens_out to 0.
int32_t server_tokens::process_chunk(
        llama_context * ctx,
        mtmd_context * mctx,
        size_t idx,
        llama_pos pos,
        int32_t seq_id,
        size_t & n_tokens_out) const {
    const auto & chunk = find_chunk(idx);
    const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
                        ? "image" : "audio";
    SRV_INF("processing %s...\n", name);
    int32_t n_batch = llama_n_batch(ctx);
    int64_t t0 = ggml_time_ms();
    llama_pos new_n_past; // unused for now
    int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
                                                   chunk.get(),
                                                   pos,
                                                   seq_id,
                                                   n_batch,
                                                   true, // logits last
                                                   &new_n_past);
    SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
    if (result != 0) {
        // Terminate the message with a newline like every other log call,
        // otherwise the next log line is glued onto this one.
        LOG_ERR("mtmd_helper_eval failed with status %d\n", result);
        n_tokens_out = 0;
        return result;
    }
    n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
    return 0;
}
|
||||
|
||||
server_tokens server_tokens::clone() const {
|
||||
server_tokens res;
|
||||
res.has_mtmd = has_mtmd;
|
||||
|
||||
@@ -221,15 +221,6 @@ public:
|
||||
// make sure all text tokens are within the vocab range
|
||||
bool validate(const struct llama_context * ctx) const;
|
||||
|
||||
// encode and decode the image chunk
|
||||
int32_t process_chunk(
|
||||
llama_context * ctx,
|
||||
mtmd_context * mctx,
|
||||
size_t idx,
|
||||
llama_pos pos,
|
||||
int32_t seq_id,
|
||||
size_t & n_tokens_out) const;
|
||||
|
||||
server_tokens clone() const;
|
||||
};
|
||||
|
||||
|
||||
@@ -15,11 +15,6 @@
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING]
|
||||
#include "../../src/llama-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
@@ -81,7 +76,6 @@ struct server_slot {
|
||||
// multimodal
|
||||
mtmd_context * mctx = nullptr;
|
||||
mtmd::batch_ptr mbatch = nullptr;
|
||||
std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context
|
||||
|
||||
// speculative decoding
|
||||
common_speculative * spec;
|
||||
@@ -207,6 +201,8 @@ struct server_slot {
|
||||
// Speculative decoding stats
|
||||
int32_t n_draft_total = 0; // Total draft tokens generated
|
||||
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
|
||||
int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
|
||||
std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position
|
||||
|
||||
void reset() {
|
||||
SLT_DBG(*this, "%s", "\n");
|
||||
@@ -233,6 +229,8 @@ struct server_slot {
|
||||
// clear speculative decoding stats
|
||||
n_draft_total = 0;
|
||||
n_draft_accepted = 0;
|
||||
n_draft_verif_steps = 0;
|
||||
n_accepted_per_pos.clear();
|
||||
|
||||
task_prev = std::move(task);
|
||||
task.reset();
|
||||
@@ -244,15 +242,6 @@ struct server_slot {
|
||||
|
||||
// clear multimodal state
|
||||
mbatch.reset();
|
||||
mtgt[0] = ctx_tgt;
|
||||
mtgt[1] = nullptr;
|
||||
if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
|
||||
// TODO: in the future, figure out how to infuse target embeddings to the images
|
||||
// for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
|
||||
// maybe we simply need to call `common_speculative_process()` ?
|
||||
// [TAG_MTMD_DRAFT_PROCESSING]
|
||||
mtgt[1] = ctx_dft;
|
||||
}
|
||||
}
|
||||
|
||||
void init_sampler() const {
|
||||
@@ -524,10 +513,22 @@ struct server_slot {
|
||||
llama_perf_context(ctx_tgt).n_reused);
|
||||
|
||||
if (n_draft_total > 0) {
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
|
||||
|
||||
std::string acceptance_rates_per_pos;
|
||||
if (n_draft_verif_steps > 0) {
|
||||
for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
|
||||
if (i > 0) {
|
||||
acceptance_rates_per_pos += ", ";
|
||||
}
|
||||
acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(*this,
|
||||
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total);
|
||||
"draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
|
||||
}
|
||||
|
||||
common_speculative_print_stats(spec);
|
||||
@@ -598,32 +599,38 @@ struct server_slot {
|
||||
int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
|
||||
GGML_ASSERT(mctx);
|
||||
const auto & input_tokens = task->tokens;
|
||||
auto & chunk = input_tokens.find_chunk(idx);
|
||||
const auto & chunk = input_tokens.find_chunk(idx);
|
||||
int32_t res = 0;
|
||||
|
||||
auto try_decode = [&]() -> int32_t {
|
||||
if (mbatch) {
|
||||
float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
|
||||
if (embd) {
|
||||
for (auto * lctx : mtgt) {
|
||||
if (lctx == nullptr) {
|
||||
continue;
|
||||
}
|
||||
llama_pos new_n_past; // unused for now
|
||||
res = mtmd_helper_decode_image_chunk(
|
||||
mctx,
|
||||
lctx,
|
||||
chunk.get(),
|
||||
embd,
|
||||
prompt.tokens.pos_next(),
|
||||
id,
|
||||
llama_n_batch(lctx),
|
||||
&new_n_past
|
||||
);
|
||||
if (res != 0) {
|
||||
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
|
||||
return -1;
|
||||
void * cb_data = spec;
|
||||
static auto cb = [](llama_batch batch, void * user_data) {
|
||||
common_speculative * spec = static_cast<common_speculative *>(user_data);
|
||||
if (!common_speculative_process(spec, batch)) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
|
||||
llama_pos new_n_past; // unused for now
|
||||
res = mtmd_helper_decode_image_chunk(
|
||||
mctx,
|
||||
ctx_tgt,
|
||||
chunk.get(),
|
||||
embd,
|
||||
prompt.tokens.pos_next(),
|
||||
id,
|
||||
llama_n_batch(ctx_tgt),
|
||||
&new_n_past,
|
||||
cb,
|
||||
cb_data
|
||||
);
|
||||
if (res != 0) {
|
||||
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
|
||||
return -1;
|
||||
}
|
||||
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
|
||||
return 0; // success
|
||||
@@ -636,7 +643,8 @@ struct server_slot {
|
||||
res = try_decode();
|
||||
if (res == 0) {
|
||||
return 0;
|
||||
} else if (res < 0) {
|
||||
}
|
||||
if (res < 0) {
|
||||
// fatal error
|
||||
return res;
|
||||
}
|
||||
@@ -3350,48 +3358,6 @@ private:
|
||||
// TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL]
|
||||
// for now, always re-evaluate for simplicity
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4400925384
|
||||
//
|
||||
// | spec type | need re-eval |
|
||||
// | --- | --- |
|
||||
// | draft model | no | because the draft model does not use embeddings from the target
|
||||
// | MTP (std) | yes |
|
||||
// | MTP Gemma4 | no | because the KV cache is shared
|
||||
// | Eagle3 | yes |
|
||||
// | DFlash | yes | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982
|
||||
//
|
||||
// note: this logic is now moved in `common_speculative_process()`
|
||||
// keeping the sketch here for a bit, until the logic is finalized
|
||||
//
|
||||
//if (ctx_dft) {
|
||||
// // TODO: update as needed for MTP, Eagle3, etc.
|
||||
// const bool need_tgt_embd = false;
|
||||
|
||||
// if (need_tgt_embd) {
|
||||
// llama_synchronize(ctx_tgt);
|
||||
// }
|
||||
|
||||
// // the logic here varies depending on the speculative decoding method
|
||||
// // - some draft contexts require embeddings from the target context, others don't
|
||||
// // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
|
||||
// // TODO: extract this in a function ?
|
||||
// {
|
||||
// // TODO: hook the embeddings from the last target batch here
|
||||
// if (llama_model_has_encoder(model_dft.get())) {
|
||||
// //llama_encode(ctx_dft, ...);
|
||||
|
||||
// GGML_ABORT("not implemented yet\n");
|
||||
// }
|
||||
|
||||
// const int ret = llama_decode(ctx_dft.get(), batch_view);
|
||||
|
||||
// if (ret != 0) {
|
||||
// SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
|
||||
|
||||
// // TODO: handle error
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
if (!common_speculative_process(spec.get(), batch_view)) {
|
||||
SRV_ERR("%s", "failed to process speculative batch\n");
|
||||
|
||||
@@ -3593,6 +3559,14 @@ private:
|
||||
|
||||
// update how many tokens out of those tested were accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
slot.n_draft_verif_steps += 1;
|
||||
|
||||
if (slot.n_accepted_per_pos.empty()) {
|
||||
slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
|
||||
}
|
||||
for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
|
||||
slot.n_accepted_per_pos[i]++;
|
||||
}
|
||||
|
||||
// add accepted tokens to the prompt
|
||||
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
|
||||
|
||||
Reference in New Issue
Block a user