mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 02:36:43 +02:00
Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7dad2f1a17 | |||
| e36a602ba3 | |||
| 38d546330a | |||
| a1eb756c0b | |||
| 581e8eca8b | |||
| 0ae3f450f0 | |||
| e3cab403bf | |||
| 9dbc6621ae | |||
| 6eab47181c | |||
| e3bb1add8c | |||
| d8a3f523c8 | |||
| 72be44f1d2 | |||
| 8872ab5467 | |||
| 987fbd821d | |||
| c035ff4902 | |||
| 272088b9f2 | |||
| a6dff71270 | |||
| 2a6c391a5e |
@@ -37,7 +37,7 @@ LLM inference in C/C++
|
||||
|
||||
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
|
||||
|
||||
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
|
||||
- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
|
||||
- Run with Docker - see our [Docker documentation](docs/docker.md)
|
||||
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
|
||||
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
|
||||
|
||||
@@ -103,6 +103,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
||||
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
|
||||
};
|
||||
if (autoparser.tools.format.openai_wrapper_trigger) {
|
||||
// model emits the OpenAI function wrapper, trigger on it
|
||||
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,7 +138,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons
|
||||
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
|
||||
parser = ctx.reasoning_parser + p.space() + p.choice({
|
||||
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
|
||||
response_format
|
||||
p.space() + response_format + p.space()
|
||||
}) + p.end();
|
||||
pure_content = false;
|
||||
} else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
|
||||
@@ -224,13 +228,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
auto single_tool_parser = p.standard_json_tools(
|
||||
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
|
||||
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
|
||||
} else {
|
||||
tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
|
||||
}
|
||||
|
||||
// Handle content wrappers if present
|
||||
@@ -393,8 +397,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
(schema_info.resolves_to_string(param_schema) ?
|
||||
p.tool_arg_string_value(until_suffix) :
|
||||
p.tool_arg_json_value(p.schema(
|
||||
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
|
||||
p.space()) +
|
||||
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false))) +
|
||||
p.tool_arg_close(p.literal(arguments.value_suffix)));
|
||||
|
||||
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
|
||||
|
||||
@@ -181,6 +181,7 @@ struct tool_format_analysis {
|
||||
|
||||
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
|
||||
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
|
||||
bool openai_wrapper_trigger = false; // model emits the OpenAI function wrapper, trigger on it
|
||||
|
||||
std::string function_field = "function";
|
||||
std::string name_field = "name";
|
||||
|
||||
@@ -165,6 +165,14 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
|
||||
tmpl.src.find("Do not use variables.") != std::string::npos) {
|
||||
analysis.tools.format.openai_wrapper_trigger = true;
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
|
||||
});
|
||||
|
||||
@@ -1229,8 +1237,8 @@ void analyze_tools::extract_argument_name_markers() {
|
||||
left_result.tags["pre"] == right_result.tags["pre"] &&
|
||||
left_result.tags["suffix"] == right_result.tags["suffix"]) {
|
||||
// Name is inside a structure (e.g., JSON key): prefix is the shared wrapper
|
||||
arguments.name_prefix = trim_whitespace(left_result.tags["pre"]);
|
||||
arguments.name_suffix = trim_leading_whitespace(left_result.tags["suffix"]);
|
||||
arguments.name_prefix = left_result.tags["pre"];
|
||||
arguments.name_suffix = left_result.tags["suffix"];
|
||||
} else if (diff.left.substr(0, ARG_FIRST.length()) == ARG_FIRST && diff.right.substr(0, ARG_SECOND.length()) == ARG_SECOND) {
|
||||
// Name is directly in the diff: prefix comes from last marker in diff.prefix
|
||||
auto pre_parser = build_tagged_peg_parser([&](common_peg_parser_builder & p) {
|
||||
@@ -1315,8 +1323,7 @@ void analyze_tools::extract_argument_value_markers() {
|
||||
value_suffix = value_suffix.substr(0, end_marker_pos);
|
||||
}
|
||||
}
|
||||
value_suffix = trim_leading_whitespace(value_suffix);
|
||||
if (!value_suffix.empty()) {
|
||||
if (!trim_whitespace(value_suffix).empty()) {
|
||||
arguments.value_suffix = value_suffix;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -363,7 +363,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
||||
}
|
||||
|
||||
if ((is_arg_value || is_arg_string_value) && current_tool) {
|
||||
std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));
|
||||
std::string value_content = std::string(node.text);
|
||||
|
||||
std::string value_to_add;
|
||||
if (value_content.empty() && is_arg_string_value) {
|
||||
@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||
auto arg_name_parser = literal(prop_name);
|
||||
|
||||
common_peg_parser arg_value_parser = eps();
|
||||
auto string_value_parser = choice({
|
||||
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
|
||||
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
|
||||
});
|
||||
// Quoted literal as a value: normalize_quotes_to_json preserves escapes.
|
||||
auto string_value_parser = tool_arg_value(choice({
|
||||
literal("\"") + string_content('"') + literal("\""),
|
||||
literal("'") + string_content('\'') + literal("'")
|
||||
}));
|
||||
|
||||
if (is_string_type) {
|
||||
arg_value_parser = string_value_parser;
|
||||
@@ -745,7 +746,8 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order) {
|
||||
const std::vector<std::string> & parameters_order,
|
||||
bool accept_openai_wrapper) {
|
||||
|
||||
auto tool_choices = choice();
|
||||
auto name_key_parser = literal("\"" + effective_name_key + "\"");
|
||||
@@ -807,7 +809,13 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||
return idx_a < idx_b;
|
||||
});
|
||||
|
||||
auto ordered_body = tool_open(literal("{")) + space();
|
||||
// accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
|
||||
common_peg_parser type_field = eps();
|
||||
if (accept_openai_wrapper) {
|
||||
type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
|
||||
literal("\"function\"") + space() + literal(",") + space());
|
||||
}
|
||||
auto ordered_body = tool_open(literal("{")) + space() + type_field;
|
||||
for (size_t i = 0; i < parser_pairs.size(); i++) {
|
||||
ordered_body = ordered_body + parser_pairs[i].first;
|
||||
if (i < parser_pairs.size() - 1) {
|
||||
@@ -870,7 +878,8 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
bool function_is_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order) {
|
||||
const std::vector<std::string> & parameters_order,
|
||||
bool accept_openai_wrapper) {
|
||||
if (!tools.is_array() || tools.empty()) {
|
||||
return eps();
|
||||
}
|
||||
@@ -888,7 +897,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
if (!name_spec.first.empty() || !args_spec.first.empty()) {
|
||||
tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
|
||||
} else {
|
||||
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
|
||||
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -120,7 +120,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
bool function_is_key = false,
|
||||
const std::string & call_id_key = "",
|
||||
const std::string & gen_call_id_key = "",
|
||||
const std::vector<std::string> & parameters_order = {});
|
||||
const std::vector<std::string> & parameters_order = {},
|
||||
bool accept_openai_wrapper = false);
|
||||
|
||||
// Legacy-compatible helper for building XML/tagged style tool calls
|
||||
// Used by tests and manual parsers
|
||||
@@ -157,7 +158,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order);
|
||||
const std::vector<std::string> & parameters_order,
|
||||
bool accept_openai_wrapper);
|
||||
};
|
||||
|
||||
inline common_peg_arena build_chat_peg_parser(
|
||||
|
||||
+3
-2
@@ -2678,8 +2678,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
|
||||
effective_input.substr(result.end));
|
||||
LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
|
||||
LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
|
||||
throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
|
||||
+32
-17
@@ -1272,13 +1272,13 @@ common_peg_parser common_peg_parser_builder::string_content(char delimiter) {
|
||||
|
||||
common_peg_parser common_peg_parser_builder::double_quoted_string() {
|
||||
return rule("double-quoted-string", [this]() {
|
||||
return sequence({literal("\""), string_content('"'), literal("\""), space()});
|
||||
return sequence({literal("\""), string_content('"'), literal("\"")});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser common_peg_parser_builder::single_quoted_string() {
|
||||
return rule("single-quoted-string", [this]() {
|
||||
return sequence({literal("'"), string_content('\''), literal("'"), space()});
|
||||
return sequence({literal("'"), string_content('\''), literal("'")});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1301,25 +1301,25 @@ common_peg_parser common_peg_parser_builder::json_number() {
|
||||
// At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
|
||||
// This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
|
||||
auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
|
||||
return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
|
||||
return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation });
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser common_peg_parser_builder::json_string() {
|
||||
return rule("json-string", [this]() {
|
||||
return sequence({literal("\""), string_content('"'), literal("\""), space()});
|
||||
return sequence({literal("\""), string_content('"'), literal("\"")});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser common_peg_parser_builder::json_bool() {
|
||||
return rule("json-bool", [this]() {
|
||||
return sequence({choice({literal("true"), literal("false")}), space()});
|
||||
return choice({literal("true"), literal("false")});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser common_peg_parser_builder::json_null() {
|
||||
return rule("json-null", [this]() {
|
||||
return sequence({literal("null"), space()});
|
||||
return literal("null");
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1334,8 +1334,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
|
||||
choice({
|
||||
literal("}"),
|
||||
sequence({members, ws, literal("}")})
|
||||
}),
|
||||
ws
|
||||
})
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -1350,8 +1349,7 @@ common_peg_parser common_peg_parser_builder::json_array() {
|
||||
choice({
|
||||
literal("]"),
|
||||
sequence({elements, ws, literal("]")})
|
||||
}),
|
||||
ws
|
||||
})
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -1381,16 +1379,13 @@ common_peg_parser common_peg_parser_builder::python_number() {
|
||||
|
||||
common_peg_parser common_peg_parser_builder::python_bool() {
|
||||
return rule("python-bool", [this]() {
|
||||
return sequence({
|
||||
choice({literal("True"), literal("False")}),
|
||||
space()
|
||||
});
|
||||
return choice({literal("True"), literal("False")});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser common_peg_parser_builder::python_null() {
|
||||
return rule("python-none", [this]() {
|
||||
return sequence({literal("None"), space()});
|
||||
return literal("None");
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1512,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
|
||||
auto pieces = matcher.collect_prefix_and_next();
|
||||
|
||||
std::string pattern;
|
||||
std::string trailing; // optional proper-prefix of a delimiter, allowed only at the very end
|
||||
for (size_t i = 0; i < pieces.size(); ++i) {
|
||||
if (i > 0) {
|
||||
pattern += " | ";
|
||||
@@ -1527,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
|
||||
}
|
||||
|
||||
if (!pre.empty()) {
|
||||
pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
|
||||
std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
|
||||
pattern += pre_literal + " [^" + cls + "]";
|
||||
// Each interior alternative consumes a delimiter-prefix plus a disambiguating
|
||||
// char, so the repetition alone cannot match a value that *ends* on a proper
|
||||
// prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
|
||||
// "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
|
||||
// values, so without this the grammar would reject input the parser accepts.
|
||||
// Allow the value to terminate on any proper prefix as an optional tail.
|
||||
// This makes the grammar a slight superset of the runtime language (a value
|
||||
// may end on the longest prefix, which greedy first-match would not itself
|
||||
// produce); harmless for constrained generation, which only needs to admit
|
||||
// every runtime-valid string.
|
||||
if (!trailing.empty()) {
|
||||
trailing += " | ";
|
||||
}
|
||||
trailing += pre_literal;
|
||||
} else {
|
||||
pattern += "[^" + cls + "]";
|
||||
}
|
||||
}
|
||||
|
||||
return "(" + pattern + ")*";
|
||||
std::string result = "(" + pattern + ")*";
|
||||
if (!trailing.empty()) {
|
||||
result += " (" + trailing + ")?";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::unordered_set<std::string> collect_reachable_rules(
|
||||
|
||||
+30
-2
@@ -1,12 +1,40 @@
|
||||
# Install pre-built version of llama.cpp
|
||||
|
||||
| Install via | Windows | Mac | Linux |
|
||||
|-------------|---------|-----|-------|
|
||||
| Install via | Windows | Mac | Linux |
|
||||
|-------------|---------|------|-------|
|
||||
| conda-forge | ✅ | ✅ | ✅ |
|
||||
| Winget | ✅ | | |
|
||||
| Homebrew | | ✅ | ✅ |
|
||||
| MacPorts | | ✅ | |
|
||||
| Nix | | ✅ | ✅ |
|
||||
|
||||
## conda-forge (Windows, Mac and Linux)
|
||||
|
||||
conda-forge provides builds for:
|
||||
- CUDA (Windows and Linux)
|
||||
- Vulkan (Windows and Linux)
|
||||
- Apple Metal (macOS)
|
||||
|
||||
```sh
|
||||
conda install -c conda-forge llama-cpp
|
||||
```
|
||||
|
||||
```sh
|
||||
mamba install -c conda-forge llama-cpp
|
||||
```
|
||||
|
||||
```sh
|
||||
# Project-local installation
|
||||
pixi add llama-cpp
|
||||
|
||||
# Global installation
|
||||
pixi global install llama-cpp
|
||||
```
|
||||
|
||||
This distribution is managed on [`conda-forge/llama.cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/).
|
||||
|
||||
Should you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues).
|
||||
|
||||
## Winget (Windows)
|
||||
|
||||
```sh
|
||||
|
||||
+2
-1
@@ -24,6 +24,7 @@ Legend:
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -77,7 +78,7 @@ Legend:
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
|
||||
| PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
+227
-84
@@ -582,42 +582,42 @@
|
||||
"SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL"
|
||||
"SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[768,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL"
|
||||
@@ -914,57 +914,58 @@
|
||||
"SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=0,p1=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=1,p1=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=1,p1=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[3000,384,1,1],ne_kernel=[3,384,384,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=0,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=0,p1=0,d0=3,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=3,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL"
|
||||
@@ -1050,6 +1051,8 @@
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,1,2],ne_kernel=[32,33,1,2],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,2,1],ne_kernel=[33,34,2,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL"
|
||||
@@ -5047,6 +5050,39 @@
|
||||
"SYCL0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,2,2,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,1,2,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f32,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=f16,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL"
|
||||
"SYCL0","COL2IM_1D","type=bf16,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL"
|
||||
"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","SYCL"
|
||||
"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","SYCL"
|
||||
"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","SYCL"
|
||||
@@ -6185,6 +6221,7 @@
|
||||
"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=64,n=1,k=64,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=256,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=512,n=1,k=512,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=32,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=4,k=128,bs=[2,3],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL"
|
||||
@@ -7603,6 +7640,31 @@
|
||||
"SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=16,n_used=16,b=1,m=50,n=200,k=64","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=1,n_used=1,b=0,m=8,n=16,k=1","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=32,n_used=2,b=0,m=2880,n=32,k=2880","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=bf16,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q4_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q4_1,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q5_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q5_1,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=384","support","0","no","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=192","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q2_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q3_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q4_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q5_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=q6_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq2_xxs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq2_xs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq2_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq3_xxs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq1_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq1_m,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq4_nl,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq3_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=iq4_xs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","SYCL"
|
||||
"SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","SYCL"
|
||||
@@ -10845,37 +10907,117 @@
|
||||
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,3],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","SYCL"
|
||||
@@ -16515,6 +16657,7 @@
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=256,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q4_0,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=128,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
|
||||
|
Can't render this file because it is too large.
|
+35
-5
@@ -3,15 +3,45 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Print the command-line help text for this build script to stdout.
print_usage() {
    # One argument per output line; the empty argument yields the blank separator line.
    printf '%s\n' \
        "Usage: ./build.sh [fp32|fp16] [--help]" \
        "" \
        "Options:" \
        " fp32 Build with FP32 precision (default)" \
        " fp16 Build with FP16 precision (faster for long-prompt inference)" \
        " --help Print this help message"
}
|
||||
|
||||
PRECISION=fp32
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--help)
|
||||
print_usage
|
||||
exit 0
|
||||
;;
|
||||
fp32|fp16)
|
||||
PRECISION="$arg"
|
||||
;;
|
||||
*)
|
||||
echo "Error: unknown option '$arg'"
|
||||
print_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
|
||||
if [ "$PRECISION" = "fp16" ]; then
|
||||
#for FP16
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
|
||||
else
|
||||
#for FP32
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
|
||||
fi
|
||||
|
||||
#build example/main
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
@@ -3,6 +3,23 @@
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
IF /I "%1"=="--help" (
|
||||
echo Usage: win-build-sycl.bat [fp32^|fp16] [--help]
|
||||
echo.
|
||||
echo Options:
|
||||
echo fp32 Build with FP32 precision ^(default^)
|
||||
echo fp16 Build with FP16 precision ^(faster for long-prompt inference^)
|
||||
echo --help Print this help message
|
||||
exit /B 0
|
||||
)
|
||||
|
||||
SET PRECISION=%1
|
||||
IF "%PRECISION%"=="" SET PRECISION=fp32
|
||||
IF /I NOT "%PRECISION%"=="fp32" IF /I NOT "%PRECISION%"=="fp16" (
|
||||
echo Error: invalid value '%PRECISION%'. Use 'fp32' or 'fp16'.
|
||||
echo Usage: win-build-sycl.bat [fp32^|fp16] [--help]
|
||||
exit /B 1
|
||||
)
|
||||
|
||||
IF not exist build (mkdir build)
|
||||
cd build
|
||||
@@ -11,12 +28,14 @@ if %errorlevel% neq 0 goto ERROR
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
IF /I "%PRECISION%"=="fp16" (
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
) ELSE (
|
||||
:: for FP32
|
||||
cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
)
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
:: build all binary
|
||||
|
||||
@@ -293,7 +293,6 @@
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
|
||||
@@ -1418,6 +1418,9 @@ typedef decltype(kernel_repeat<float>) kernel_repeat_t;
|
||||
|
||||
template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
|
||||
template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat<bfloat>;
|
||||
#endif
|
||||
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
|
||||
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ bool gpu_has_xmx(sycl::device &dev) {
|
||||
return dev.has(sycl::aspect::ext_intel_matrix);
|
||||
}
|
||||
|
||||
static int ggml_sycl_get_env(const char *env_name, int default_val) {
|
||||
int ggml_sycl_get_env(const char *env_name, int default_val) {
|
||||
char *user_device_string = getenv(env_name);
|
||||
int user_number = default_val;
|
||||
|
||||
@@ -86,7 +86,7 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
||||
return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) &&
|
||||
return g_ggml_sycl_enable_level_zero &&
|
||||
q.get_device().is_gpu() &&
|
||||
q.get_backend() == sycl::backend::ext_oneapi_level_zero;
|
||||
}
|
||||
@@ -94,8 +94,6 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
||||
|
||||
// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
|
||||
// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
|
||||
// The decision is made from the queue and runtime env because large buffers can be
|
||||
// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero.
|
||||
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
if (ggml_sycl_use_level_zero_device_alloc(q)) {
|
||||
|
||||
@@ -225,6 +225,7 @@ struct sycl_device_info {
|
||||
int max_wg_per_cu; // max work groups per compute unit - refer to
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
bool l0_discrete_gpu; // Level Zero backend and not an integrated GPU
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
sycl_hw_info hw_info;
|
||||
@@ -644,6 +645,8 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
|
||||
int ggml_sycl_get_env(const char *env_name, int default_val);
|
||||
|
||||
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return "";
|
||||
|
||||
@@ -48,6 +48,287 @@ inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
}
|
||||
}
|
||||
|
||||
inline void cpy_blck_f32_q1_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q1_0 * dsti = (block_q1_0 *) cdsti;
|
||||
|
||||
float sum_abs = 0.0f;
|
||||
for (int j = 0; j < QK1_0; ++j) {
|
||||
sum_abs += sycl::fabs((float) xi[j]);
|
||||
}
|
||||
|
||||
dsti->d = sum_abs / QK1_0;
|
||||
|
||||
for (int j = 0; j < QK1_0 / 8; ++j) {
|
||||
dsti->qs[j] = 0;
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK1_0; ++j) {
|
||||
if (xi[j] >= 0.0f) {
|
||||
dsti->qs[j / 8] |= (1u << (j % 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline int best_index_mxfp4(const float x, const float e) {
|
||||
int best_index = 0;
|
||||
float best_err = sycl::fabs((float) (kvalues_mxfp4[0] * e - x));
|
||||
for (int i = 1; i < 16; ++i) {
|
||||
const float err = sycl::fabs((float) (kvalues_mxfp4[i] * e - x));
|
||||
if (err < best_err) {
|
||||
best_index = i;
|
||||
best_err = err;
|
||||
}
|
||||
}
|
||||
return best_index;
|
||||
}
|
||||
|
||||
// Round x to an integer without calling a rounding function: add 12582912.0f (1.5 * 2^23),
// reinterpret the float's bits as an int, and read the result out of the low 23 bits.
// NOTE(review): presumably only meaningful for |x| well below 2^22 - confirm with callers.
inline int nearest_int_sycl(float x) {
    const float shifted = x + 12582912.0f;

    int bits;
    memcpy(&bits, &shifted, sizeof(int));

    const int low23 = bits & 0x007fffff;
    return low23 - 0x00400000;
}
|
||||
|
||||
inline int nearest_int_ggml_sycl(float x) {
|
||||
return (int) sycl::round((float) x);
|
||||
}
|
||||
|
||||
// Limit x to at most hi, then to at least lo, and return the result converted to uint8_t.
inline uint8_t clamp_u8(const int x, const int lo, const int hi) {
    const auto capped  = dpct::min(hi, x);
    const auto bounded = dpct::max(lo, capped);
    return (uint8_t) bounded;
}
|
||||
|
||||
// Limit x to at most hi, then to at least lo, and return the result converted to int8_t.
inline int8_t clamp_i8(const int x, const int lo, const int hi) {
    const auto capped  = dpct::min(hi, x);
    const auto bounded = dpct::max(lo, capped);
    return (int8_t) bounded;
}
|
||||
|
||||
constexpr float GROUP_MAX_EPS_SYCL = 1e-15f;
|
||||
|
||||
inline float make_qx_quants_sycl(int n, int nmax, const float * x, int8_t * L, int rmse_type, const float * qw) {
|
||||
float max = 0.0f;
|
||||
float amax = 0.0f;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const float ax = sycl::fabs(x[i]);
|
||||
if (ax > amax) {
|
||||
amax = ax;
|
||||
max = x[i];
|
||||
}
|
||||
}
|
||||
if (amax < GROUP_MAX_EPS_SYCL) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
L[i] = 0;
|
||||
}
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
float iscale = -nmax / max;
|
||||
if (rmse_type == 0) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int_ggml_sycl(iscale * x[i]);
|
||||
L[i] = (int8_t) (nmax + dpct::max(-nmax, dpct::min(nmax - 1, l)));
|
||||
}
|
||||
return 1.0f / iscale;
|
||||
}
|
||||
|
||||
bool return_early = false;
|
||||
if (rmse_type < 0) {
|
||||
rmse_type = -rmse_type;
|
||||
return_early = true;
|
||||
}
|
||||
|
||||
float sumlx = 0.0f;
|
||||
float suml2 = 0.0f;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int_ggml_sycl(iscale * x[i]);
|
||||
l = dpct::max(-nmax, dpct::min(nmax - 1, l));
|
||||
L[i] = (int8_t) (l + nmax);
|
||||
|
||||
const float w = qw ? qw[i] : (rmse_type == 1 ? x[i] * x[i] :
|
||||
rmse_type == 2 ? 1.0f : rmse_type == 3 ? sycl::fabs(x[i]) : sycl::sqrt(sycl::fabs(x[i])));
|
||||
|
||||
sumlx += w * x[i] * l;
|
||||
suml2 += w * l * l;
|
||||
}
|
||||
|
||||
float scale = suml2 ? sumlx / suml2 : 0.0f;
|
||||
if (return_early) {
|
||||
return suml2 > 0.0f ? 0.5f * (scale + 1.0f / iscale) : 1.0f / iscale;
|
||||
}
|
||||
|
||||
float best = scale * sumlx;
|
||||
for (int is = -9; is <= 9; ++is) {
|
||||
if (is == 0) {
|
||||
continue;
|
||||
}
|
||||
iscale = -(nmax + 0.1f * is) / max;
|
||||
sumlx = 0.0f;
|
||||
suml2 = 0.0f;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int_ggml_sycl(iscale * x[i]);
|
||||
l = dpct::max(-nmax, dpct::min(nmax - 1, l));
|
||||
const float w = qw ? qw[i] : (rmse_type == 1 ? x[i] * x[i] :
|
||||
rmse_type == 2 ? 1.0f : rmse_type == 3 ? sycl::fabs(x[i]) : sycl::sqrt(sycl::fabs(x[i])));
|
||||
sumlx += w * x[i] * l;
|
||||
suml2 += w * l * l;
|
||||
}
|
||||
|
||||
if (suml2 > 0.0f && sumlx * sumlx > best * suml2) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int_ggml_sycl(iscale * x[i]);
|
||||
L[i] = (int8_t) (nmax + dpct::max(-nmax, dpct::min(nmax - 1, l)));
|
||||
}
|
||||
scale = sumlx / suml2;
|
||||
best = scale * sumlx;
|
||||
}
|
||||
}
|
||||
|
||||
return scale;
|
||||
}
|
||||
|
||||
inline float make_q3_quants_sycl(int n, int nmax, const float * x, int8_t * L, bool do_rmse) {
|
||||
float max = 0.0f;
|
||||
float amax = 0.0f;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const float ax = sycl::fabs(x[i]);
|
||||
if (ax > amax) {
|
||||
amax = ax;
|
||||
max = x[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (amax < GROUP_MAX_EPS_SYCL) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
L[i] = 0;
|
||||
}
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
const float iscale = -nmax / max;
|
||||
if (do_rmse) {
|
||||
float sumlx = 0.0f;
|
||||
float suml2 = 0.0f;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int_ggml_sycl(iscale * x[i]);
|
||||
l = dpct::max(-nmax, dpct::min(nmax - 1, l));
|
||||
L[i] = (int8_t) l;
|
||||
const float w = x[i] * x[i];
|
||||
sumlx += w * x[i] * l;
|
||||
suml2 += w * l * l;
|
||||
}
|
||||
|
||||
for (int itry = 0; itry < 5; ++itry) {
|
||||
int n_changed = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const float w = x[i] * x[i];
|
||||
float slx = sumlx - w * x[i] * L[i];
|
||||
if (slx > 0.0f) {
|
||||
float sl2 = suml2 - w * L[i] * L[i];
|
||||
int new_l = nearest_int_ggml_sycl(x[i] * sl2 / slx);
|
||||
new_l = dpct::max(-nmax, dpct::min(nmax - 1, new_l));
|
||||
if (new_l != L[i]) {
|
||||
slx += w * x[i] * new_l;
|
||||
sl2 += w * new_l * new_l;
|
||||
if (sl2 > 0.0f && slx * slx * suml2 > sumlx * sumlx * sl2) {
|
||||
L[i] = (int8_t) new_l;
|
||||
sumlx = slx;
|
||||
suml2 = sl2;
|
||||
++n_changed;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!n_changed) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
L[i] += nmax;
|
||||
}
|
||||
return suml2 > 0.0f ? sumlx / suml2 : 0.0f;
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int_ggml_sycl(iscale * x[i]);
|
||||
l = dpct::max(-nmax, dpct::min(nmax - 1, l));
|
||||
L[i] = (int8_t) (l + nmax);
|
||||
}
|
||||
|
||||
return 1.0f / iscale;
|
||||
}
|
||||
|
||||
// Store the 6-bit pair (d, m) for slot j into the packed array q.
//   j < 4 : d goes into the low 6 bits of q[j], m into the low 6 bits of q[j + 4];
//           the top 2 bits of both bytes are left untouched.
//   j >= 4: the low nibbles of d and m share q[j + 4]; the bits of d and m above the
//           low nibble go into the top 2 bits of q[j - 4] and q[j] respectively.
inline void set_scale_min_k4(int j, uint8_t * q, uint8_t d, uint8_t m) {
    if (j < 4) {
        uint8_t & d_byte = q[j];
        uint8_t & m_byte = q[j + 4];
        d_byte = (d_byte & 0xC0) | (d & 0x3F);
        m_byte = (m_byte & 0xC0) | (m & 0x3F);
        return;
    }

    const int d_low  = d & 0x0F;
    const int m_low  = m & 0x0F;
    const int d_high = (d >> 4) << 6;
    const int m_high = (m >> 4) << 6;

    q[j + 4] = d_low | (m_low << 4);
    q[j - 4] = (q[j - 4] & 0x3F) | d_high;
    q[j - 0] = (q[j - 0] & 0x3F) | m_high;
}
|
||||
|
||||
// Read the 6-bit pair (d, m) for slot j out of the packed array q.
//   j < 4 : d and m are the low 6 bits of q[j] and q[j + 4].
//   j >= 4: the low nibbles come from q[j + 4] (d in the low half, m in the high half) and
//           the upper 2 bits come from the top 2 bits of q[j - 4] (for d) and q[j] (for m).
inline void get_scale_min_k4_local(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63;
        m = q[j + 4] & 63;
        return;
    }

    const uint8_t nibbles = q[j + 4];
    const int     d_top   = (q[j - 4] >> 6) << 4;
    const int     m_top   = (q[j - 0] >> 6) << 4;

    d = (nibbles & 0xF) | d_top;
    m = (nibbles >> 4) | m_top;
}
|
||||
|
||||
inline void cpy_blck_f32_mxfp4(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_mxfp4 * dsti = (block_mxfp4 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < QK_MXFP4; ++j) {
|
||||
amax = sycl::fmax(amax, sycl::fabs((float) xi[j]));
|
||||
}
|
||||
|
||||
const uint8_t e = amax > 0.0f ? (uint8_t) (sycl::floor(sycl::log2(amax)) - 2 + 127) : 0;
|
||||
const float d = GGML_E8M0_TO_FP32_HALF(e);
|
||||
|
||||
dsti->e = e;
|
||||
|
||||
for (int j = 0; j < QK_MXFP4 / 2; ++j) {
|
||||
const uint8_t x0 = best_index_mxfp4(xi[0 + j], d);
|
||||
const uint8_t x1 = best_index_mxfp4(xi[QK_MXFP4 / 2 + j], d);
|
||||
|
||||
dsti->qs[j] = x0;
|
||||
dsti->qs[j] |= x1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
inline void cpy_blck_f32_nvfp4(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_nvfp4 * dsti = (block_nvfp4 *) cdsti;
|
||||
|
||||
constexpr int n_sub = QK_NVFP4 / QK_NVFP4_SUB;
|
||||
|
||||
for (int s = 0; s < n_sub; ++s) {
|
||||
const float * xb = xi + s * QK_NVFP4_SUB;
|
||||
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < QK_NVFP4_SUB; ++j) {
|
||||
amax = sycl::fmax(amax, sycl::fabs((float) xb[j]));
|
||||
}
|
||||
|
||||
const uint8_t ue = ggml_fp32_to_ue4m3(amax / 6.0f);
|
||||
dsti->d[s] = ue;
|
||||
const float d = ggml_ue4m3_to_fp32(ue);
|
||||
|
||||
for (int j = 0; j < QK_NVFP4_SUB / 2; ++j) {
|
||||
const uint8_t x0 = best_index_mxfp4(xb[0 + j], d);
|
||||
const uint8_t x1 = best_index_mxfp4(xb[QK_NVFP4_SUB / 2 + j], d);
|
||||
|
||||
dsti->qs[s * (QK_NVFP4_SUB / 2) + j] = x0 | (x1 << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
||||
|
||||
+318
-333
File diff suppressed because it is too large
Load Diff
@@ -70,6 +70,7 @@
|
||||
#include "ggml-sycl/diag.hpp"
|
||||
#include "ggml-sycl/solve_tri.hpp"
|
||||
#include "ggml-sycl/gated_delta_net.hpp"
|
||||
#include "ggml-sycl/pool.hpp"
|
||||
|
||||
static bool g_sycl_loaded = false;
|
||||
int g_ggml_sycl_debug = 0;
|
||||
@@ -147,11 +148,31 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
||||
info.ext_oneapi_level_zero = false;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
|
||||
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
|
||||
ze_device_properties_t props = {};
|
||||
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
||||
info.devices[i].l0_discrete_gpu = r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
info.default_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
// Large buffers can be allocated before ggml_check_sycl() initializes other
|
||||
// g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
|
||||
g_ggml_sycl_enable_level_zero =
|
||||
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
#endif
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
@@ -236,38 +257,19 @@ void ggml_backend_sycl_print_sycl_devices() {
|
||||
print_device_opt_feature(device_count);
|
||||
}
|
||||
|
||||
// Read an integer setting from the environment variable env_name.
// Returns default_val when the variable is unset or does not start (after optional
// whitespace) with something sscanf can parse as "%u"; otherwise returns the parsed
// value converted to int.
static inline int get_sycl_env(const char *env_name, int default_val) {
    const char * raw = getenv(env_name);
    if (raw == NULL) {
        return default_val;
    }

    unsigned parsed;
    if (sscanf(raw, " %u", &parsed) != 1) {
        return default_val;
    }

    return (int) parsed;
}
|
||||
|
||||
static void ggml_check_sycl() try {
|
||||
static bool initialized = false;
|
||||
|
||||
if (!initialized) {
|
||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
#endif
|
||||
g_ggml_sycl_debug = ggml_sycl_get_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize = ggml_sycl_get_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = ggml_sycl_get_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = ggml_sycl_get_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
#else
|
||||
g_ggml_sycl_enable_flash_attention = 0;
|
||||
#endif
|
||||
@@ -330,7 +332,7 @@ static void ggml_check_sycl() try {
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
g_ggml_sycl_use_async_mem_op_requested = ggml_sycl_get_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
@@ -569,26 +571,18 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
|
||||
if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
|
||||
ze_device_properties_t props = {};
|
||||
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
||||
return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
||||
static bool ggml_sycl_is_l0_discrete_gpu(int device) {
|
||||
return ggml_sycl_info().devices[device].l0_discrete_gpu;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
||||
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
|
||||
const void *ptr_src, size_t size) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
||||
const bool l0_copy_supported =
|
||||
ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
|
||||
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
||||
const bool l0_copy_supported = g_ggml_sycl_enable_level_zero &&
|
||||
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
|
||||
if (l0_copy_supported) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
||||
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
||||
@@ -651,7 +645,7 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
size_t size = ggml_nbytes(src);
|
||||
|
||||
//todo: this is a dirty workaround for a known issue with device-to-device copies across GPUs.
|
||||
dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size);
|
||||
dev2dev_memcpy(dst_ctx->device, *stream_dst, src_ctx->device, *stream_src, dst->data, src->data, size);
|
||||
|
||||
//todo: known issue: errors in device-to-device copies across GPUs. Re-enable this code once the issue is fixed. DON'T remove.
|
||||
#if 0
|
||||
@@ -1947,69 +1941,6 @@ static void scale_f32(const float * x, float * dst, const float scale, const flo
|
||||
}
|
||||
|
||||
|
||||
template <typename Ti, typename To>
|
||||
static void pool2d_nchw_kernel(
|
||||
const int ih, const int iw, const int oh, const int ow,
|
||||
const int kh, const int kw, const int sh, const int sw,
|
||||
const int ph, const int pw, const int parallel_elements,
|
||||
const Ti* src, To* dst, const enum ggml_op_pool op,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (idx >= parallel_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int I_HW = ih * iw;
|
||||
const int O_HW = oh * ow;
|
||||
const int nc = idx / O_HW;
|
||||
const int cur_oh = idx % O_HW / ow;
|
||||
const int cur_ow = idx % O_HW % ow;
|
||||
const Ti* i_ptr = src + nc * I_HW;
|
||||
To* o_ptr = dst + nc * O_HW;
|
||||
const int start_h = cur_oh * sh - ph;
|
||||
const int bh = sycl::max(0, start_h);
|
||||
const int eh = sycl::min(ih, start_h + kh);
|
||||
const int start_w = cur_ow * sw - pw;
|
||||
const int bw = sycl::max(0, start_w);
|
||||
const int ew = sycl::min(iw, start_w + kw);
|
||||
|
||||
To res = 0;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res = 0; break;
|
||||
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
||||
default:
|
||||
res = (To) sycl::nan(uint32_t(0));
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = bh; i < eh; i += 1) {
|
||||
for (int j = bw; j < ew; j += 1) {
|
||||
#if DPCT_COMPATIBILITY_TEMP >= 350
|
||||
/*
|
||||
DPCT1098:106: The '*' expression is used instead of the __ldg
|
||||
call. These two expressions do not provide the exact same
|
||||
functionality. Check the generated code for potential precision
|
||||
and/or performance issues.
|
||||
*/
|
||||
Ti cur = *(i_ptr + i * iw + j);
|
||||
#else
|
||||
Ti cur = i_ptr[i * iw + j];
|
||||
#endif
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
|
||||
case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
|
||||
default:
|
||||
res = (To) sycl::nan(uint32_t(0));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
|
||||
|
||||
|
||||
static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
||||
float *dst, const int ncols_x,
|
||||
const int nrows_x,
|
||||
@@ -2558,45 +2489,6 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
// Launch the 2D pooling kernel for dst, whose single source is dst->src[0].
// Both tensors must be F32. The pooling operator, kernel sizes (k0, k1), strides (s0, s1)
// and paddings (p0, p1) are read from dst->op_params[0..6]. One work-item is launched per
// output element, rounded up to whole work-groups of SYCL_POOL2D_BLOCK_SIZE.
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
    float * dst_dd = static_cast<float *>(dst->data);

    const int32_t * opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

    const int64_t IH = dst->src[0]->ne[1];
    const int64_t IW = dst->src[0]->ne[0];

    const int64_t N = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    const int parallel_elements = N * OC * OH * OW;
    const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;

    // The work-group size must be the same constant that num_blocks was derived from;
    // otherwise the launch would not cover exactly ceil(parallel_elements / block) groups.
    const sycl::range<3> block_dims(1, 1, SYCL_POOL2D_BLOCK_SIZE);
    const sycl::range<3> block_nums(1, 1, num_blocks);
    main_stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) {
            // The kernel takes (height, width) ordering, hence k1/s1/p1 before k0/s0/p0.
            pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0,
                               parallel_elements, src0_dd, dst_dd, op,
                               item_ct1);
        });
}
|
||||
|
||||
inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
@@ -3056,7 +2948,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;
|
||||
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
||||
CHECK_TRY_ERROR(dev2dev_memcpy(i, *stream, ctx.device, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
}
|
||||
}
|
||||
@@ -4435,6 +4327,11 @@ static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
|
||||
ggml_sycl_op_pool2d(ctx, dst);
|
||||
}
|
||||
|
||||
// Dispatch wrapper for 1D pooling: sets up the scoped debug print for this op (one source
// tensor) and forwards to ggml_sycl_op_pool1d.
static void ggml_sycl_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print dbg_scope(__func__, dst, /*num_src=*/1);

    ggml_sycl_op_pool1d(ctx, dst);
}
|
||||
|
||||
static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_im2col(ctx, dst);
|
||||
@@ -4748,6 +4645,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_sycl_pool2d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_POOL_1D:
|
||||
ggml_sycl_pool1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SUM:
|
||||
ggml_sycl_sum(ctx, dst);
|
||||
break;
|
||||
@@ -5342,10 +5242,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
|
||||
|
||||
auto res = ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
|
||||
op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
|
||||
op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
|
||||
op->type == GGML_TYPE_Q1_0 ||
|
||||
op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL ||
|
||||
op->type == GGML_TYPE_MXFP4 || op->type == GGML_TYPE_NVFP4) &&
|
||||
op->src[0]->type == GGML_TYPE_F32 &&
|
||||
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32));
|
||||
return res;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
@@ -5502,6 +5407,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
k > 0 && k <= 32;
|
||||
}
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_POOL_1D:
|
||||
return true;
|
||||
case GGML_OP_ACC:
|
||||
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
|
||||
+63
-67
@@ -662,13 +662,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy,
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
@@ -683,13 +682,13 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols(
|
||||
const int stride_col_y_bytes, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder_ncols<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);
|
||||
@@ -1080,13 +1079,12 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy,
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
@@ -1101,13 +1099,13 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols(
|
||||
const int stride_col_y_bytes, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder_ncols<reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0>, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);
|
||||
@@ -1289,13 +1287,12 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q3_K>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
@@ -1310,13 +1307,13 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols(
|
||||
const int stride_col_y_bytes, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder_ncols<reorder_vec_dot_q_sycl<GGML_TYPE_Q3_K>, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);
|
||||
@@ -1457,13 +1454,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
|
||||
nrows, nd_item);
|
||||
@@ -1478,13 +1474,14 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols(
|
||||
const int stride_col_y_bytes, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder_ncols<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);
|
||||
@@ -1583,15 +1580,13 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(vx, vy, dst, ncols,
|
||||
nrows, nd_item);
|
||||
@@ -1606,13 +1601,14 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols(
|
||||
const int stride_col_y_bytes, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder_ncols<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);
|
||||
@@ -1643,13 +1639,13 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
@@ -1664,13 +1660,13 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols(
|
||||
const int stride_col_y_bytes, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups);
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder_ncols<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);
|
||||
|
||||
@@ -0,0 +1,185 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2026 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "pool.hpp"
|
||||
#include <float.h>
|
||||
|
||||
template <typename Ti, typename To>
|
||||
static void pool2d_nchw_kernel(
|
||||
const int ih, const int iw, const int oh, const int ow,
|
||||
const int kh, const int kw, const int sh, const int sw,
|
||||
const int ph, const int pw, const int parallel_elements,
|
||||
const Ti* src, To* dst, const enum ggml_op_pool op,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (idx >= parallel_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int I_HW = ih * iw;
|
||||
const int O_HW = oh * ow;
|
||||
const int nc = idx / O_HW;
|
||||
const int cur_oh = idx % O_HW / ow;
|
||||
const int cur_ow = idx % O_HW % ow;
|
||||
const Ti* i_ptr = src + nc * I_HW;
|
||||
To* o_ptr = dst + nc * O_HW;
|
||||
const int start_h = cur_oh * sh - ph;
|
||||
const int bh = sycl::max(0, start_h);
|
||||
const int eh = sycl::min(ih, start_h + kh);
|
||||
const int start_w = cur_ow * sw - pw;
|
||||
const int bw = sycl::max(0, start_w);
|
||||
const int ew = sycl::min(iw, start_w + kw);
|
||||
|
||||
To res = 0;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res = 0; break;
|
||||
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
||||
default:
|
||||
res = (To) sycl::nan(uint32_t(0));
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = bh; i < eh; i += 1) {
|
||||
for (int j = bw; j < ew; j += 1) {
|
||||
Ti cur = i_ptr[i * iw + j];
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
|
||||
case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
|
||||
default:
|
||||
res = (To) sycl::nan(uint32_t(0));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
|
||||
|
||||
template <typename Ti, typename To>
|
||||
static void pool1d_ncw_kernel(
|
||||
const int iw, const int ow,
|
||||
const int k, const int s,
|
||||
const int p, const int parallel_elements,
|
||||
const Ti * src, To * dst, const enum ggml_op_pool op,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
int idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (idx >= parallel_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int nc = idx / ow;
|
||||
const int cur_ow = idx % ow;
|
||||
const Ti * i_ptr = src + nc * iw;
|
||||
To * o_ptr = dst + nc * ow;
|
||||
const int start = cur_ow * s - p;
|
||||
const int b = sycl::max(0, start);
|
||||
const int e = sycl::min(iw, start + k);
|
||||
|
||||
To res = 0;
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res = 0; break;
|
||||
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
||||
default:
|
||||
res = (To) sycl::nan(uint32_t(0));
|
||||
break;
|
||||
}
|
||||
|
||||
for (int j = b; j < e; j += 1) {
|
||||
Ti cur = i_ptr[j];
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res += cur; break;
|
||||
case GGML_OP_POOL_MAX: res = sycl::max(res, (To) cur); break;
|
||||
default:
|
||||
res = (To) sycl::nan(uint32_t(0));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const int count = e - b;
|
||||
if (op == GGML_OP_POOL_AVG) {
|
||||
res = (count > 0) ? (res / count) : (To) 0;
|
||||
}
|
||||
o_ptr[cur_ow] = res;
|
||||
}
|
||||
|
||||
// Host-side launcher for 2D pooling on the SYCL backend.
//
// Reads the pooling configuration from dst->op_params (op, k0, k1, s0, s1, p0, p1 in
// that order), derives the input/output extents from dst->src[0] and dst, and submits
// pool2d_nchw_kernel with one work-item per output element. Both the source and the
// destination tensor must be F32.
//
// The "0" parameters apply to dimension ne[0] (width) and the "1" parameters to
// dimension ne[1] (height), which is why they are passed to the kernel in swapped
// (height, width) order.
void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(        dst->type == GGML_TYPE_F32);
    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
    float *       dst_dd  = static_cast<float *>(dst->data);

    const int32_t *   opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op   = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

    const int64_t IH = dst->src[0]->ne[1];
    const int64_t IW = dst->src[0]->ne[0];

    const int64_t N  = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    const int parallel_elements = N * OC * OH * OW;
    const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;

    // The work-group size must be the same constant that num_blocks was derived from;
    // otherwise the launch would not cover exactly parallel_elements (rounded up).
    const sycl::range<3> block_nums(1, 1, num_blocks);
    const sycl::range<3> block_dims(1, 1, SYCL_POOL2D_BLOCK_SIZE);

    main_stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) {
            pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0,
                               parallel_elements, src0_dd, dst_dd, op,
                               item_ct1);
        });
}
|
||||
|
||||
// Host-side launcher for 1D pooling on the SYCL backend.
//
// Reads the pooling configuration from dst->op_params (op, k0, s0, p0 in that order),
// treats every dimension of dst above ne[0] as an independent row, and submits
// pool1d_ncw_kernel with one work-item per output element. Both the source and the
// destination tensor must be F32.
void ggml_sycl_op_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(        dst->type == GGML_TYPE_F32);

    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));

    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
    float *       dst_dd  = static_cast<float *>(dst->data);

    // op_params layout: [0] pooling op, [1] kernel size, [2] stride, [3] padding.
    const int32_t *   opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op   = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int s0 = opts[2];
    const int p0 = opts[3];

    const int64_t IW = dst->src[0]->ne[0];
    const int64_t OW = dst->ne[0];
    const int64_t NC = dst->ne[3] * dst->ne[2] * dst->ne[1];

    // One work-item per output element, rounded up to whole work-groups.
    const int parallel_elements = NC * OW;
    const int num_blocks = (parallel_elements + SYCL_POOL1D_BLOCK_SIZE - 1) / SYCL_POOL1D_BLOCK_SIZE;

    sycl::range<3> block_nums(1, 1, num_blocks);
    sycl::range<3> block_dims(1, 1, SYCL_POOL1D_BLOCK_SIZE);

    main_stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) {
            pool1d_ncw_kernel(IW, OW, k0, s0, p0,
                              parallel_elements, src0_dd, dst_dd, op,
                              item_ct1);
        });
}
|
||||
@@ -0,0 +1,22 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2026 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_POOL_HPP
|
||||
#define GGML_SYCL_POOL_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
#include "presets.hpp"
|
||||
|
||||
void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
void ggml_sycl_op_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_POOL_HPP
|
||||
@@ -46,6 +46,7 @@
|
||||
#define SYCL_PAD_BLOCK_SIZE 256
|
||||
#define SYCL_ACC_BLOCK_SIZE 256
|
||||
#define SYCL_IM2COL_BLOCK_SIZE 256
|
||||
#define SYCL_POOL1D_BLOCK_SIZE 256
|
||||
#define SYCL_POOL2D_BLOCK_SIZE 256
|
||||
#define SYCL_ARGMAX_BLOCK_SIZE 256
|
||||
#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
|
||||
|
||||
@@ -135,7 +135,7 @@ static void set_rows_sycl(
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(grid_size * block_size, block_size),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
[=](sycl::nd_item<1> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
k_set_rows<TIn, TIdx, TOut>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, ne01, ne02,
|
||||
@@ -202,6 +202,9 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
|
||||
case GGML_TYPE_Q8_0:
|
||||
set_rows_sycl_q<TIdx, block_q8_0, QK8_0, cpy_blck_f32_q8_0>(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q1_0:
|
||||
set_rows_sycl_q<TIdx, block_q1_0, QK1_0, cpy_blck_f32_q1_0>(src0_d, src1_d, (block_q1_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
set_rows_sycl_q<TIdx, block_q5_1, QK5_1, cpy_blck_f32_q5_1>(src0_d, src1_d, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
@@ -217,7 +220,12 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
set_rows_sycl_q<TIdx, block_iq4_nl, QK4_NL, cpy_blck_f32_iq4_nl>(src0_d, src1_d, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_MXFP4:
|
||||
set_rows_sycl_q<TIdx, block_mxfp4, QK_MXFP4, cpy_blck_f32_mxfp4>(src0_d, src1_d, (block_mxfp4 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_NVFP4:
|
||||
set_rows_sycl_q<TIdx, block_nvfp4, QK_NVFP4, cpy_blck_f32_nvfp4>(src0_d, src1_d, (block_nvfp4 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type!");
|
||||
break;
|
||||
|
||||
@@ -56,7 +56,7 @@ static void soft_max_f32(const float * x,
|
||||
: block_size_template;
|
||||
const int nthreads = block_size;
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
size_t nreduce = nwarps / WARP_SIZE;
|
||||
const size_t nreduce = nwarps / WARP_SIZE;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
@@ -105,17 +105,15 @@ static void soft_max_f32(const float * x,
|
||||
max_val = warp_reduce_max<WARP_SIZE>(max_val);
|
||||
|
||||
if (block_size > WARP_SIZE) {
|
||||
if (warp_id == 0) {
|
||||
buf_iw[lane_id] = -INFINITY;
|
||||
}
|
||||
item_ct1.barrier();
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf_iw[warp_id] = max_val;
|
||||
}
|
||||
item_ct1.barrier();
|
||||
|
||||
max_val = buf_iw[lane_id];
|
||||
max_val = -INFINITY;
|
||||
for (int i = lane_id; i < nwarps; i += WARP_SIZE) {
|
||||
max_val = sycl::max(max_val, buf_iw[i]);
|
||||
}
|
||||
max_val = warp_reduce_max<WARP_SIZE>(max_val);
|
||||
}
|
||||
float tmp = 0.0f; // partial sum
|
||||
@@ -290,7 +288,8 @@ static void soft_max_f32_sycl(const float *x, const T *mask,
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
soft_max_f32<false, 0, 0>(
|
||||
x, mask, sinks, dst, params,
|
||||
dpct_local_acc_ct1
|
||||
|
||||
@@ -798,7 +798,7 @@ struct vk_device_struct {
|
||||
|
||||
vk_pipeline pipeline_add_id_f32;
|
||||
|
||||
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
|
||||
vk_pipeline pipeline_concat_i8, pipeline_concat_i16, pipeline_concat_i32, pipeline_concat_i64;
|
||||
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32;
|
||||
vk_pipeline pipeline_scale_f32;
|
||||
vk_pipeline pipeline_sqr_f32;
|
||||
@@ -4996,9 +4996,10 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_i8, "concat_i8", concat_i8_len, concat_i8_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_i16, "concat_i16", concat_i16_len, concat_i16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_i64, "concat_i64", concat_i64_len, concat_i64_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
|
||||
@@ -10318,17 +10319,27 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_add_id_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONCAT:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_concat_f32;
|
||||
case GGML_OP_CONCAT: {
|
||||
if (src0->type != src1->type || src0->type != dst->type) {
|
||||
return nullptr;
|
||||
}
|
||||
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_concat_f16;
|
||||
if (ggml_blck_size(src0->type) != 1) {
|
||||
return nullptr;
|
||||
}
|
||||
if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||
const size_t type_size = ggml_type_size(src0->type);
|
||||
switch (type_size) {
|
||||
case 1:
|
||||
return ctx->device->pipeline_concat_i8;
|
||||
case 2:
|
||||
return ctx->device->pipeline_concat_i16;
|
||||
case 4:
|
||||
return ctx->device->pipeline_concat_i32;
|
||||
case 8:
|
||||
return ctx->device->pipeline_concat_i64;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
case GGML_OP_UPSCALE:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS));
|
||||
@@ -17042,8 +17053,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SET:
|
||||
return op->src[0]->type == op->src[1]->type && op->src[0]->type == op->type &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_I32);
|
||||
case GGML_OP_CONCAT:
|
||||
return ggml_type_size(op->src[0]->type) == ggml_type_size(GGML_TYPE_F32);
|
||||
case GGML_OP_CONCAT: {
|
||||
if (op->src[0]->type != op->src[1]->type || op->src[0]->type != op->type) {
|
||||
return false;
|
||||
}
|
||||
const size_t type_size = ggml_type_size(op->type);
|
||||
return ggml_blck_size(op->type) == 1 &&
|
||||
(type_size == 1 || type_size == 2 || type_size == 4 || type_size == 8);
|
||||
}
|
||||
case GGML_OP_ADD1:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32)
|
||||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
|
||||
|
||||
@@ -862,9 +862,10 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
||||
string_to_spv("concat_i8", "concat.comp", {{"A_TYPE", "uint8_t"}, {"B_TYPE", "uint8_t"}, {"D_TYPE", "uint8_t"}});
|
||||
string_to_spv("concat_i16", "concat.comp", {{"A_TYPE", "uint16_t"}, {"B_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
|
||||
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "uint"}, {"B_TYPE", "uint"}, {"D_TYPE", "uint"}});
|
||||
string_to_spv("concat_i64", "concat.comp", {{"A_TYPE", "uvec2"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "uvec2"}});
|
||||
|
||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ void test_gbnf_generation(testing &t) {
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
|
||||
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* ("<" | "</" | "</t" | "</ta" | "</tag")?
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
@@ -130,12 +130,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16) {
|
||||
// This is going to create some weird integers though.
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, nels * ggml_type_size(tensor->type));
|
||||
} else if (tensor->type == GGML_TYPE_I64) {
|
||||
// Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
|
||||
const size_t nbytes_half = ggml_nbytes(tensor)/2;
|
||||
const size_t nbytes_half = nels * sizeof(float);
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
|
||||
ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
|
||||
} else {
|
||||
|
||||
@@ -1369,7 +1369,7 @@ static void test_nemotron_tool_format(testing & t) {
|
||||
// Check argument markers (note: markers retain trailing newlines for proper parsing)
|
||||
t.assert_equal("arg_name_prefix should be '<parameter='", "<parameter=", analysis.tools.arguments.name_prefix);
|
||||
t.assert_equal("arg_name_suffix should be '>\\n'", ">\n", analysis.tools.arguments.name_suffix);
|
||||
t.assert_equal("arg_value_suffix should be '</parameter>\\n'", "</parameter>\n", analysis.tools.arguments.value_suffix);
|
||||
t.assert_equal("arg_value_suffix should be '\\n</parameter>\\n'", "\n</parameter>\n", analysis.tools.arguments.value_suffix);
|
||||
|
||||
// Check format classification
|
||||
t.assert_true("tool format should be TAG_WITH_TAGGED", analysis.tools.format.mode == tool_format::TAG_WITH_TAGGED);
|
||||
@@ -2030,12 +2030,11 @@ static void test_tagged_args_with_embedded_quotes(testing & t) {
|
||||
return p.content(p.until("<seed:tool_call>")) + p.optional(tool_section) + p.end();
|
||||
});
|
||||
|
||||
// The exact input from the failing test
|
||||
std::string input =
|
||||
"<seed:tool_call>\n"
|
||||
"<function=edit>\n"
|
||||
"<parameter=filename>\n"
|
||||
"foo.cpp\n"
|
||||
"<parameter=filename>"
|
||||
"foo.cpp"
|
||||
"</parameter>\n"
|
||||
"<parameter=oldString>"
|
||||
"def foo(arg = \"14\"):\n"
|
||||
|
||||
+114
-6
@@ -1882,11 +1882,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de
|
||||
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
|
||||
.run();
|
||||
|
||||
// Python tool with multiline code in string
|
||||
// Python tool with multiline code in string: the \n in the literal decodes to a real
|
||||
// newline, emitted as a JSON \n escape (not a doubled backslash).
|
||||
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
|
||||
{ "python", R"#({"code": "def hello():\n print('hey')"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
// String escape sequences decode to their actual characters (newline + tab here),
|
||||
// so a "write a two line file" style call produces real line breaks, not literal "\n".
|
||||
tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "First line\nSecond line\tindented"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
// Escaped quotes inside a string argument survive the round-trip.
|
||||
tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "print(\"hi\")"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
@@ -1935,6 +1953,10 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
}
|
||||
})";
|
||||
|
||||
const char * const_schema = R"({
|
||||
"const": "42"
|
||||
})";
|
||||
|
||||
{
|
||||
// Qwen3.5 (basically same as Nemotron, but keeping separate tests just in case)
|
||||
auto tst = peg_tester("models/templates/Qwen3.5-4B.jinja", detailed_debug);
|
||||
@@ -2020,6 +2042,80 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=edit>\n"
|
||||
"<parameter=filename>\n"
|
||||
"foo.c\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=oldString>\n"
|
||||
"#iclunde\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=newString>\n"
|
||||
"#include\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
edit_tool
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// a parameter value that itself ends in a newline (e.g. a source file with a
|
||||
// trailing newline). The structural delimiter is "\n</parameter>\n", so the value
|
||||
// "#include\n" renders as "...#include\n\n</parameter>\n". The trailing newline must
|
||||
// be preserved faithfully (no stripping), and the generated grammar must admit a
|
||||
// value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern.
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=edit>\n"
|
||||
"<parameter=filename>\n"
|
||||
"foo.c\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=oldString>\n"
|
||||
"#iclunde\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=newString>\n"
|
||||
"#include\n"
|
||||
"\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
edit_tool
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
|
||||
// test code that starts with indent
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
python_tool
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \" print(\\\"Hello, world!\\\")\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"I need to output the invoice details in JSON\n"
|
||||
"</think>\n\n"
|
||||
@@ -3196,18 +3292,16 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
tst.test(
|
||||
"<seed:tool_call>\n"
|
||||
"<function=edit>\n"
|
||||
"<parameter=filename>\n"
|
||||
"foo.cpp\n"
|
||||
"<parameter=filename>"
|
||||
"foo.cpp"
|
||||
"</parameter>\n"
|
||||
"<parameter=oldString>"
|
||||
"def foo(arg = \"14\"):\n"
|
||||
" return arg + \"bar\"\n"
|
||||
"\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=newString>"
|
||||
"def foo(arg = \"15\"):\n"
|
||||
" pass\n"
|
||||
"\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</seed:tool_call>")
|
||||
@@ -4927,6 +5021,20 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
|
||||
tst.test(
|
||||
"```json\n\"42\" \n```")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.json_schema(const_schema)
|
||||
.expect_content(R"("42")")
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"\"42\" \n")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.json_schema(const_schema)
|
||||
.expect_content(R"("42")")
|
||||
.run();
|
||||
|
||||
// Continuation tests
|
||||
tst.test("world!\nWhat's up?")
|
||||
.messages({ message_user, message_assist_prefill_content })
|
||||
|
||||
@@ -247,7 +247,9 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
llama_pos * new_n_past,
|
||||
mtmd_helper_post_decode_callback callback,
|
||||
void * user_data) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
@@ -302,10 +304,23 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
int32_t ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
if (use_non_causal) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (callback != nullptr) {
|
||||
ret = callback(batch_embd_view, user_data);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("post-decode callback failed\n");
|
||||
if (use_non_causal) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
|
||||
i_batch++;
|
||||
@@ -379,7 +394,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
|
||||
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past, nullptr, nullptr);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_batch_free(text_batch);
|
||||
|
||||
@@ -91,6 +91,8 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
typedef int32_t (*mtmd_helper_post_decode_callback)(struct llama_batch batch, void * user_data);
|
||||
|
||||
// helper function to decode an image whose embeddings have already been calculated
|
||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
||||
@@ -101,7 +103,9 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past);
|
||||
llama_pos * new_n_past,
|
||||
mtmd_helper_post_decode_callback callback,
|
||||
void * user_data);
|
||||
|
||||
//
|
||||
// video input helpers (requires ffmpeg/ffprobe installed on the system)
|
||||
|
||||
+8
-9
@@ -96,16 +96,15 @@ struct mtmd_image_tokens {
|
||||
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
|
||||
return (nx + 1) * ny + 2;
|
||||
}
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
|
||||
return nx * ny;
|
||||
}
|
||||
uint32_t nz = batch_f32.entries.size();
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
if (n_temporal_merge > 1) {
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
}
|
||||
}
|
||||
return nx * ny * nz;
|
||||
}
|
||||
|
||||
@@ -539,37 +539,6 @@ bool server_tokens::validate(const struct llama_context * ctx) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t server_tokens::process_chunk(
|
||||
llama_context * ctx,
|
||||
mtmd_context * mctx,
|
||||
size_t idx,
|
||||
llama_pos pos,
|
||||
int32_t seq_id,
|
||||
size_t & n_tokens_out) const {
|
||||
const auto & chunk = find_chunk(idx);
|
||||
const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
|
||||
? "image" : "audio";
|
||||
SRV_INF("processing %s...\n", name);
|
||||
int32_t n_batch = llama_n_batch(ctx);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
llama_pos new_n_past; // unused for now
|
||||
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
|
||||
chunk.get(),
|
||||
pos,
|
||||
seq_id,
|
||||
n_batch,
|
||||
true, // logits last
|
||||
&new_n_past);
|
||||
SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
|
||||
if (result != 0) {
|
||||
LOG_ERR("mtmd_helper_eval failed with status %d", result);
|
||||
n_tokens_out = 0;
|
||||
return result;
|
||||
}
|
||||
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
|
||||
return 0;
|
||||
}
|
||||
|
||||
server_tokens server_tokens::clone() const {
|
||||
server_tokens res;
|
||||
res.has_mtmd = has_mtmd;
|
||||
|
||||
@@ -221,15 +221,6 @@ public:
|
||||
// make sure all text tokens are within the vocab range
|
||||
bool validate(const struct llama_context * ctx) const;
|
||||
|
||||
// encode and decode the image chunk
|
||||
int32_t process_chunk(
|
||||
llama_context * ctx,
|
||||
mtmd_context * mctx,
|
||||
size_t idx,
|
||||
llama_pos pos,
|
||||
int32_t seq_id,
|
||||
size_t & n_tokens_out) const;
|
||||
|
||||
server_tokens clone() const;
|
||||
};
|
||||
|
||||
|
||||
@@ -15,11 +15,6 @@
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING]
|
||||
#include "../../src/llama-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
@@ -81,7 +76,6 @@ struct server_slot {
|
||||
// multimodal
|
||||
mtmd_context * mctx = nullptr;
|
||||
mtmd::batch_ptr mbatch = nullptr;
|
||||
std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context
|
||||
|
||||
// speculative decoding
|
||||
common_speculative * spec;
|
||||
@@ -244,15 +238,6 @@ struct server_slot {
|
||||
|
||||
// clear multimodal state
|
||||
mbatch.reset();
|
||||
mtgt[0] = ctx_tgt;
|
||||
mtgt[1] = nullptr;
|
||||
if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
|
||||
// TODO: in the future, figure out how to infuse target embeddings to the images
|
||||
// for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
|
||||
// maybe we simply need to call `common_speculative_process()` ?
|
||||
// [TAG_MTMD_DRAFT_PROCESSING]
|
||||
mtgt[1] = ctx_dft;
|
||||
}
|
||||
}
|
||||
|
||||
void init_sampler() const {
|
||||
@@ -598,32 +583,38 @@ struct server_slot {
|
||||
int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
|
||||
GGML_ASSERT(mctx);
|
||||
const auto & input_tokens = task->tokens;
|
||||
auto & chunk = input_tokens.find_chunk(idx);
|
||||
const auto & chunk = input_tokens.find_chunk(idx);
|
||||
int32_t res = 0;
|
||||
|
||||
auto try_decode = [&]() -> int32_t {
|
||||
if (mbatch) {
|
||||
float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
|
||||
if (embd) {
|
||||
for (auto * lctx : mtgt) {
|
||||
if (lctx == nullptr) {
|
||||
continue;
|
||||
}
|
||||
llama_pos new_n_past; // unused for now
|
||||
res = mtmd_helper_decode_image_chunk(
|
||||
mctx,
|
||||
lctx,
|
||||
chunk.get(),
|
||||
embd,
|
||||
prompt.tokens.pos_next(),
|
||||
id,
|
||||
llama_n_batch(lctx),
|
||||
&new_n_past
|
||||
);
|
||||
if (res != 0) {
|
||||
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
|
||||
return -1;
|
||||
void * cb_data = spec;
|
||||
static auto cb = [](llama_batch batch, void * user_data) {
|
||||
common_speculative * spec = static_cast<common_speculative *>(user_data);
|
||||
if (!common_speculative_process(spec, batch)) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
|
||||
llama_pos new_n_past; // unused for now
|
||||
res = mtmd_helper_decode_image_chunk(
|
||||
mctx,
|
||||
ctx_tgt,
|
||||
chunk.get(),
|
||||
embd,
|
||||
prompt.tokens.pos_next(),
|
||||
id,
|
||||
llama_n_batch(ctx_tgt),
|
||||
&new_n_past,
|
||||
cb,
|
||||
cb_data
|
||||
);
|
||||
if (res != 0) {
|
||||
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
|
||||
return -1;
|
||||
}
|
||||
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
|
||||
return 0; // success
|
||||
@@ -636,7 +627,8 @@ struct server_slot {
|
||||
res = try_decode();
|
||||
if (res == 0) {
|
||||
return 0;
|
||||
} else if (res < 0) {
|
||||
}
|
||||
if (res < 0) {
|
||||
// fatal error
|
||||
return res;
|
||||
}
|
||||
@@ -3350,48 +3342,6 @@ private:
|
||||
// TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL]
|
||||
// for now, always re-evaluate for simplicity
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4400925384
|
||||
//
|
||||
// | spec type | need re-eval |
|
||||
// | --- | --- |
|
||||
// | draft model | no | because the draft model does not use embeddings from the target
|
||||
// | MTP (std) | yes |
|
||||
// | MTP Gemma4 | no | because the KV cache is shared
|
||||
// | Eagle3 | yes |
|
||||
// | DFlash | yes | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982
|
||||
//
|
||||
// note: this logic is now moved in `common_speculative_process()`
|
||||
// keeping the sketch here until for a bit, until the logic is finalized
|
||||
//
|
||||
//if (ctx_dft) {
|
||||
// // TODO: update as needed for MTP, Eagle3, etc.
|
||||
// const bool need_tgt_embd = false;
|
||||
|
||||
// if (need_tgt_embd) {
|
||||
// llama_synchronize(ctx_tgt);
|
||||
// }
|
||||
|
||||
// // the logic here varies depending on the speculative decoding method
|
||||
// // - some draft contexts require embeddings from the target context, others don't
|
||||
// // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
|
||||
// // TODO: extract this in a function ?
|
||||
// {
|
||||
// // TODO: hook the embeddings from the last target batch here
|
||||
// if (llama_model_has_encoder(model_dft.get())) {
|
||||
// //llama_encode(ctx_dft, ...);
|
||||
|
||||
// GGML_ABORT("not implemented yet\n");
|
||||
// }
|
||||
|
||||
// const int ret = llama_decode(ctx_dft.get(), batch_view);
|
||||
|
||||
// if (ret != 0) {
|
||||
// SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
|
||||
|
||||
// // TODO: handle error
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
if (!common_speculative_process(spec.get(), batch_view)) {
|
||||
SRV_ERR("%s", "failed to process speculative batch\n");
|
||||
|
||||
|
||||
Generated
+4
-3
@@ -35,6 +35,7 @@
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.4.3",
|
||||
"dompurify": "3.4.5",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
@@ -8651,9 +8652,9 @@
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/dompurify": {
|
||||
"version": "3.4.8",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.8.tgz",
|
||||
"integrity": "sha512-yb1cEmaOum7wFvOCSQxyfgVlv5D47Rc30iZWoMpbDIWTnJ6grDDQyu2KFJzB2k7u0pMuJcQ1zphH//fFnw2tjQ==",
|
||||
"version": "3.4.5",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.5.tgz",
|
||||
"integrity": "sha512-OrwIBKsdNSVEeubdJ1HBv/wNENRM9ytAVCv7YXt//A3vPdVMNuACRqK9mXCGCBW2ln7BT/A4X0jXHo2Gu89miA==",
|
||||
"dev": true,
|
||||
"license": "(MPL-2.0 OR Apache-2.0)",
|
||||
"optionalDependencies": {
|
||||
|
||||
@@ -54,6 +54,7 @@
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.4.3",
|
||||
"dompurify": "3.4.5",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
import { rehypeEnhanceCodeBlocks } from './plugins/rehype/enhance-code-blocks';
|
||||
import { rehypeEnhanceMermaidBlocks } from './plugins/rehype/enhance-mermaid-blocks';
|
||||
import { rehypeMermaidPre } from './plugins/rehype/mermaid-pre';
|
||||
import { rehypeSvgPre } from './plugins/rehype/svg-pre';
|
||||
import { rehypeEnhanceSvgBlocks } from './plugins/rehype/enhance-svg-blocks';
|
||||
import { rehypeResolveAttachmentImages } from './plugins/rehype/resolve-attachment-images';
|
||||
import { rehypeRtlSupport } from './plugins/rehype/rehype-rtl-support';
|
||||
import { remarkLiteralHtml } from './plugins/remark/literal-html';
|
||||
@@ -38,11 +40,26 @@
|
||||
DATA_ERROR_BOUND_ATTR,
|
||||
DATA_ERROR_HANDLED_ATTR,
|
||||
BOOL_TRUE_STRING,
|
||||
SETTINGS_KEYS
|
||||
SETTINGS_KEYS,
|
||||
MERMAID_WRAPPER_CLASS,
|
||||
MERMAID_BLOCK_CLASS,
|
||||
MERMAID_LANGUAGE,
|
||||
MERMAID_SYNTAX_ATTR,
|
||||
MERMAID_RENDERED_ATTR,
|
||||
SVG_WRAPPER_CLASS,
|
||||
SVG_BLOCK_CLASS,
|
||||
SVG_LANGUAGE,
|
||||
XML_LANGUAGE,
|
||||
SVG_TAG_PREFIX,
|
||||
SVG_SOURCE_ATTR,
|
||||
SVG_RENDERED_ATTR,
|
||||
SVG_INLINE_SHADOW_STYLE
|
||||
} from '$lib/constants';
|
||||
import { ColorMode, UrlProtocol } from '$lib/enums';
|
||||
import { FileTypeText } from '$lib/enums/files.enums';
|
||||
import { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } from '$lib/utils';
|
||||
import { sanitizeSvg } from '$lib/utils/sanitize-svg';
|
||||
import { mountSvgShadow } from '$lib/utils/svg-shadow';
|
||||
import '$styles/katex-custom.scss';
|
||||
import githubDarkCss from 'highlight.js/styles/github-dark.css?inline';
|
||||
import githubLightCss from 'highlight.js/styles/github.css?inline';
|
||||
@@ -77,11 +94,32 @@
|
||||
let renderedBlocks = $state<MarkdownBlock[]>([]);
|
||||
let unstableBlockHtml = $state('');
|
||||
let incompleteCodeBlock = $state<IncompleteCodeBlock | null>(null);
|
||||
const streamingSvgCode = $derived.by(() => {
|
||||
const block = incompleteCodeBlock;
|
||||
if (!block) return null;
|
||||
if (block.language === SVG_LANGUAGE) return block.code;
|
||||
if (block.language === XML_LANGUAGE && block.code.trimStart().startsWith(SVG_TAG_PREFIX))
|
||||
return block.code;
|
||||
return null;
|
||||
});
|
||||
const liveSvgHtml = $derived(streamingSvgCode !== null ? sanitizeSvg(streamingSvgCode) : '');
|
||||
let previewDialogOpen = $state(false);
|
||||
let previewCode = $state('');
|
||||
let previewLanguage = $state('text');
|
||||
let mermaidPreviewOpen = $state(false);
|
||||
let mermaidPreviewSvgHtml = $state('');
|
||||
let svgPreviewLive = $state(false);
|
||||
let streamingSvgHost = $state<HTMLDivElement | null>(null);
|
||||
|
||||
// While the zoom dialog is open on a streaming svg, mirror the live render into it
|
||||
$effect(() => {
|
||||
if (svgPreviewLive && liveSvgHtml) mermaidPreviewSvgHtml = liveSvgHtml;
|
||||
});
|
||||
|
||||
// Mount the streaming svg into its shadow host on every chunk so it renders live
|
||||
$effect(() => {
|
||||
if (streamingSvgHost) mountSvgShadow(streamingSvgHost, liveSvgHtml, SVG_INLINE_SHADOW_STYLE);
|
||||
});
|
||||
|
||||
let streamingCodeScrollContainer = $state<HTMLDivElement>();
|
||||
|
||||
@@ -124,8 +162,10 @@
|
||||
.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables
|
||||
.use(rehypeEnhanceLinks) // Add target="_blank" to links
|
||||
.use(rehypeMermaidPre) // Convert mermaid blocks to <pre class="mermaid">
|
||||
.use(rehypeSvgPre) // Convert svg blocks to <pre class="svg-block">
|
||||
.use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions
|
||||
.use(rehypeEnhanceMermaidBlocks) // Wrap mermaid blocks with header and actions
|
||||
.use(rehypeEnhanceSvgBlocks) // Wrap svg blocks with header and actions
|
||||
.use(rehypeResolveAttachmentImages, { attachments })
|
||||
.use(rehypeRtlSupport) // Add bidirectional text support
|
||||
.use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string
|
||||
@@ -462,17 +502,19 @@
|
||||
const target = event.target as HTMLElement;
|
||||
|
||||
// Check if clicking on copy or preview button in mermaid block
|
||||
const copyBtn = target.closest('.mermaid-block-wrapper .copy-code-btn');
|
||||
const previewBtn = target.closest('.mermaid-block-wrapper .preview-code-btn');
|
||||
const copyBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .copy-code-btn`);
|
||||
const previewBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .preview-code-btn`);
|
||||
|
||||
if (copyBtn || previewBtn) {
|
||||
const wrapper = target.closest('.mermaid-block-wrapper');
|
||||
const wrapper = target.closest(`.${MERMAID_WRAPPER_CLASS}`);
|
||||
if (!wrapper) return;
|
||||
|
||||
const preElement = wrapper.querySelector<HTMLElement>('pre.mermaid[data-mermaid-syntax]');
|
||||
const preElement = wrapper.querySelector<HTMLElement>(
|
||||
`pre.${MERMAID_BLOCK_CLASS}[${MERMAID_SYNTAX_ATTR}]`
|
||||
);
|
||||
if (!preElement) return;
|
||||
|
||||
const mermaidSyntax = preElement.dataset.mermaidSyntax ?? '';
|
||||
const mermaidSyntax = preElement.getAttribute(MERMAID_SYNTAX_ATTR) ?? '';
|
||||
|
||||
if (copyBtn) {
|
||||
event.preventDefault();
|
||||
@@ -491,19 +533,70 @@
|
||||
const svg = preElement.querySelector('svg');
|
||||
if (!svg) return;
|
||||
mermaidPreviewSvgHtml = svg.outerHTML;
|
||||
svgPreviewLive = false;
|
||||
mermaidPreviewOpen = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if clicking on copy or preview button in svg block
|
||||
const svgCopyBtn = target.closest(`.${SVG_WRAPPER_CLASS} .copy-code-btn`);
|
||||
const svgPreviewBtn = target.closest(`.${SVG_WRAPPER_CLASS} .preview-code-btn`);
|
||||
|
||||
if (svgCopyBtn || svgPreviewBtn) {
|
||||
const wrapper = target.closest(`.${SVG_WRAPPER_CLASS}`);
|
||||
if (!wrapper) return;
|
||||
|
||||
const preElement = wrapper.querySelector<HTMLElement>(
|
||||
`pre.${SVG_BLOCK_CLASS}[${SVG_SOURCE_ATTR}]`
|
||||
);
|
||||
if (!preElement) return;
|
||||
|
||||
if (svgCopyBtn) {
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
try {
|
||||
await copyToClipboard(preElement.getAttribute(SVG_SOURCE_ATTR) ?? '');
|
||||
} catch (error) {
|
||||
console.error('Failed to copy svg source:', error);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (svgPreviewBtn) {
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
mermaidPreviewSvgHtml = sanitizeSvg(preElement.getAttribute(SVG_SOURCE_ATTR) ?? '');
|
||||
svgPreviewLive = false;
|
||||
mermaidPreviewOpen = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Open preview when clicking the svg block itself. A final block carries its
|
||||
// source, a streaming block does not and is mirrored live into the dialog.
|
||||
const svgEl = target.closest(`.${SVG_BLOCK_CLASS}`);
|
||||
if (svgEl) {
|
||||
const source = svgEl.getAttribute(SVG_SOURCE_ATTR);
|
||||
if (source !== null) {
|
||||
mermaidPreviewSvgHtml = sanitizeSvg(source);
|
||||
svgPreviewLive = false;
|
||||
} else {
|
||||
svgPreviewLive = true;
|
||||
}
|
||||
mermaidPreviewOpen = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise, open preview when clicking on the mermaid diagram itself
|
||||
const mermaidEl = target.closest('.mermaid');
|
||||
const mermaidEl = target.closest(`.${MERMAID_BLOCK_CLASS}`);
|
||||
if (!mermaidEl) return;
|
||||
|
||||
const svg = mermaidEl.querySelector('svg');
|
||||
if (!svg) return;
|
||||
|
||||
mermaidPreviewSvgHtml = svg.outerHTML;
|
||||
svgPreviewLive = false;
|
||||
mermaidPreviewOpen = true;
|
||||
}
|
||||
|
||||
@@ -515,6 +608,7 @@
|
||||
mermaidPreviewOpen = open;
|
||||
if (!open) {
|
||||
mermaidPreviewSvgHtml = '';
|
||||
svgPreviewLive = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -527,12 +621,14 @@
|
||||
async function renderMermaidDiagrams() {
|
||||
if (!containerRef) return;
|
||||
|
||||
const nodes = containerRef.querySelectorAll('pre.mermaid:not([data-mermaid-rendered])');
|
||||
const nodes = containerRef.querySelectorAll(
|
||||
`pre.${MERMAID_BLOCK_CLASS}:not([${MERMAID_RENDERED_ATTR}])`
|
||||
);
|
||||
if (nodes.length === 0) return;
|
||||
|
||||
// Mark nodes immediately to prevent duplicate renders if called again during streaming.
|
||||
// This avoids needing a guard that would block node discovery.
|
||||
nodes.forEach((node) => node.setAttribute('data-mermaid-rendered', 'true'));
|
||||
nodes.forEach((node) => node.setAttribute(MERMAID_RENDERED_ATTR, 'true'));
|
||||
|
||||
// Read mode before await so Svelte tracks it reactively.
|
||||
const isDark = mode.current === ColorMode.DARK;
|
||||
@@ -565,6 +661,34 @@
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Renders svg diagrams that haven't been rendered yet.
|
||||
* Sanitizes the source before injecting and marks each node so it renders once.
|
||||
* An empty sanitize result keeps the raw source as escaped text.
|
||||
*/
|
||||
function renderSvgDiagrams() {
|
||||
if (!containerRef) return;
|
||||
|
||||
const nodes = containerRef.querySelectorAll<HTMLElement>(
|
||||
`pre.${SVG_BLOCK_CLASS}:not([${SVG_RENDERED_ATTR}])`
|
||||
);
|
||||
if (nodes.length === 0) return;
|
||||
|
||||
nodes.forEach((node) => {
|
||||
node.setAttribute(SVG_RENDERED_ATTR, 'true');
|
||||
|
||||
const source = node.getAttribute(SVG_SOURCE_ATTR) ?? node.textContent ?? '';
|
||||
const clean = sanitizeSvg(source);
|
||||
|
||||
if (clean) {
|
||||
node.textContent = '';
|
||||
const host = document.createElement('div');
|
||||
node.appendChild(host);
|
||||
mountSvgShadow(host, clean, SVG_INLINE_SHADOW_STYLE);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles image load errors by replacing the image with a fallback UI.
|
||||
* Shows a placeholder with a link to open the image in a new tab.
|
||||
@@ -647,6 +771,7 @@
|
||||
setupCodeBlockActions();
|
||||
setupImageErrorHandlers();
|
||||
renderMermaidDiagrams();
|
||||
renderSvgDiagrams();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -689,7 +814,7 @@
|
||||
{/if}
|
||||
|
||||
{#if incompleteCodeBlock}
|
||||
{#if incompleteCodeBlock.language === 'mermaid'}
|
||||
{#if incompleteCodeBlock.language === MERMAID_LANGUAGE}
|
||||
<div class="mermaid-block-wrapper streaming-mermaid-block">
|
||||
<div class="code-block-header">
|
||||
<span class="code-language">mermaid</span>
|
||||
@@ -705,6 +830,30 @@
|
||||
<span class="mermaid-loading-text">Generating diagram...</span>
|
||||
</div>
|
||||
</div>
|
||||
{:else if streamingSvgCode !== null}
|
||||
<div class="svg-block-wrapper streaming-svg-block">
|
||||
<div class="code-block-header">
|
||||
<span class="code-language">svg</span>
|
||||
<div class="code-block-actions">
|
||||
<ActionIconCopyToClipboard
|
||||
text={incompleteCodeBlock.code}
|
||||
canCopy={false}
|
||||
ariaLabel="Diagram incomplete"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
{#if liveSvgHtml}
|
||||
<div class="svg-scroll-container">
|
||||
<div class={SVG_BLOCK_CLASS}>
|
||||
<div bind:this={streamingSvgHost}></div>
|
||||
</div>
|
||||
</div>
|
||||
{:else}
|
||||
<div class="mermaid-loading-placeholder">
|
||||
<span class="mermaid-loading-text">Rendering svg...</span>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
{:else}
|
||||
<div class="code-block-wrapper streaming-code-block relative">
|
||||
<div class="code-block-header">
|
||||
|
||||
@@ -560,8 +560,9 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
border-color: var(--primary);
|
||||
}
|
||||
|
||||
/* Mermaid diagrams */
|
||||
.markdown-content :global(pre.mermaid) {
|
||||
/* Mermaid and svg blocks share the same block styling */
|
||||
.markdown-content :global(pre.mermaid),
|
||||
.markdown-content :global(.svg-block) {
|
||||
background: transparent;
|
||||
border: none;
|
||||
padding: 0;
|
||||
@@ -572,13 +573,25 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
position: relative;
|
||||
}
|
||||
|
||||
/* The svg block fills its flex container so the shadow host has a definite width to render into */
|
||||
.markdown-content :global(.svg-block) {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
/* Hide mermaid code text until rendered - prevents flash */
|
||||
.markdown-content :global(pre.mermaid:not([data-mermaid-rendered])),
|
||||
.markdown-content :global(pre.mermaid[data-mermaid-rendered]:not(:has(svg))) {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.markdown-content :global(pre.mermaid:hover) {
|
||||
/* Hide svg source until rendered - prevents flash. A rendered-but-unsanitized
|
||||
block (oversized source) keeps its raw text visible as a safe fallback. */
|
||||
.markdown-content :global(pre.svg-block:not([data-svg-rendered])) {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.markdown-content :global(pre.mermaid:hover),
|
||||
.markdown-content :global(.svg-block:hover) {
|
||||
opacity: 0.85;
|
||||
}
|
||||
|
||||
@@ -590,8 +603,9 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
padding: 3rem 1rem;
|
||||
}
|
||||
|
||||
/* Mermaid block wrapper - matches code block styling */
|
||||
.markdown-content :global(.mermaid-block-wrapper) {
|
||||
/* Diagram block wrapper - matches code block styling */
|
||||
.markdown-content :global(.mermaid-block-wrapper),
|
||||
.markdown-content :global(.svg-block-wrapper) {
|
||||
margin: 1.5rem 0;
|
||||
border-radius: 0.75rem;
|
||||
overflow: hidden;
|
||||
@@ -603,11 +617,13 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
max-height: var(--max-message-height);
|
||||
}
|
||||
|
||||
.markdown-content:global(.dark) :global(.mermaid-block-wrapper) {
|
||||
.markdown-content:global(.dark) :global(.mermaid-block-wrapper),
|
||||
.markdown-content:global(.dark) :global(.svg-block-wrapper) {
|
||||
border-color: color-mix(in oklch, var(--border) 20%, transparent);
|
||||
}
|
||||
|
||||
.markdown-content :global(.mermaid-scroll-container) {
|
||||
.markdown-content :global(.mermaid-scroll-container),
|
||||
.markdown-content :global(.svg-scroll-container) {
|
||||
min-height: 350px;
|
||||
max-height: var(--max-message-height);
|
||||
overflow-y: auto;
|
||||
@@ -618,17 +634,20 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
padding: 3rem 1rem 1rem;
|
||||
}
|
||||
|
||||
.full-height-code-blocks :global(.mermaid-block-wrapper) {
|
||||
.full-height-code-blocks :global(.mermaid-block-wrapper),
|
||||
.full-height-code-blocks :global(.svg-block-wrapper) {
|
||||
max-height: none;
|
||||
}
|
||||
|
||||
.full-height-code-blocks :global(.mermaid-scroll-container) {
|
||||
.full-height-code-blocks :global(.mermaid-scroll-container),
|
||||
.full-height-code-blocks :global(.svg-scroll-container) {
|
||||
max-height: none;
|
||||
overflow-y: visible;
|
||||
}
|
||||
|
||||
/* Mermaid block uses same header styling as code blocks */
|
||||
.markdown-content :global(.mermaid-block-wrapper .code-block-header) {
|
||||
/* Diagram block uses same header styling as code blocks */
|
||||
.markdown-content :global(.mermaid-block-wrapper .code-block-header),
|
||||
.markdown-content :global(.svg-block-wrapper .code-block-header) {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
@@ -640,14 +659,16 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
right: 0;
|
||||
}
|
||||
|
||||
.markdown-content :global(.mermaid-block-wrapper .code-block-actions) {
|
||||
.markdown-content :global(.mermaid-block-wrapper .code-block-actions),
|
||||
.markdown-content :global(.svg-block-wrapper .code-block-actions) {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
/* Mermaid pre element - remove default margins */
|
||||
.markdown-content :global(.mermaid-block-wrapper pre.mermaid) {
|
||||
/* Diagram pre element - remove default margins */
|
||||
.markdown-content :global(.mermaid-block-wrapper pre.mermaid),
|
||||
.markdown-content :global(.svg-block-wrapper pre.svg-block) {
|
||||
background: transparent;
|
||||
border: none;
|
||||
padding: 0;
|
||||
@@ -655,7 +676,6 @@ div.markdown-user-content :global(.table-wrapper) {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* Mermaid SVG should be bigger */
|
||||
.markdown-content :global(.mermaid-block-wrapper pre.mermaid svg) {
|
||||
width: unset !important;
|
||||
height: auto;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
|
||||
import { copyCodeToClipboard, copyToClipboard } from '$lib/utils';
|
||||
import { MERMAID_WRAPPER_CLASS, MERMAID_BLOCK_CLASS, MERMAID_SYNTAX_ATTR } from '$lib/constants';
|
||||
|
||||
export interface PreviewState {
|
||||
previewDialogOpen: boolean;
|
||||
@@ -106,17 +107,19 @@ export function createHandleMermaidClick(mermaidState: MermaidPreviewState) {
|
||||
const target = event.target as HTMLElement;
|
||||
|
||||
// Check if clicking on copy or preview button in mermaid block
|
||||
const copyBtn = target.closest('.mermaid-block-wrapper .copy-code-btn');
|
||||
const previewBtn = target.closest('.mermaid-block-wrapper .preview-code-btn');
|
||||
const copyBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .copy-code-btn`);
|
||||
const previewBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .preview-code-btn`);
|
||||
|
||||
if (copyBtn || previewBtn) {
|
||||
const wrapper = target.closest('.mermaid-block-wrapper');
|
||||
const wrapper = target.closest(`.${MERMAID_WRAPPER_CLASS}`);
|
||||
if (!wrapper) return;
|
||||
|
||||
const preElement = wrapper.querySelector<HTMLElement>('pre.mermaid[data-mermaid-syntax]');
|
||||
const preElement = wrapper.querySelector<HTMLElement>(
|
||||
`pre.${MERMAID_BLOCK_CLASS}[${MERMAID_SYNTAX_ATTR}]`
|
||||
);
|
||||
if (!preElement) return;
|
||||
|
||||
const mermaidSyntax = preElement.dataset.mermaidSyntax ?? '';
|
||||
const mermaidSyntax = preElement.getAttribute(MERMAID_SYNTAX_ATTR) ?? '';
|
||||
|
||||
if (copyBtn) {
|
||||
event.preventDefault();
|
||||
@@ -141,7 +144,7 @@ export function createHandleMermaidClick(mermaidState: MermaidPreviewState) {
|
||||
}
|
||||
|
||||
// Otherwise, open preview when clicking on the mermaid diagram itself
|
||||
const mermaidEl = target.closest('.mermaid');
|
||||
const mermaidEl = target.closest(`.${MERMAID_BLOCK_CLASS}`);
|
||||
if (!mermaidEl) return;
|
||||
|
||||
const svg = mermaidEl.querySelector('svg');
|
||||
|
||||
+18
-9
@@ -13,7 +13,14 @@
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Root, Element, ElementContent } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
import { MERMAID_WRAPPER_CLASS, MERMAID_SCROLL_CONTAINER_CLASS } from '$lib/constants';
|
||||
import {
|
||||
MERMAID_WRAPPER_CLASS,
|
||||
MERMAID_SCROLL_CONTAINER_CLASS,
|
||||
MERMAID_BLOCK_CLASS,
|
||||
MERMAID_LANGUAGE,
|
||||
MERMAID_SYNTAX_ATTR,
|
||||
MERMAID_ID_ATTR
|
||||
} from '$lib/constants';
|
||||
import {
|
||||
createBlockHeader,
|
||||
createCopyButton,
|
||||
@@ -43,11 +50,13 @@ export const rehypeEnhanceMermaidBlocks: Plugin<[], Root> = () => {
|
||||
const className = node.properties?.className;
|
||||
if (!Array.isArray(className)) return;
|
||||
|
||||
const isMermaid = className.some((cls) => typeof cls === 'string' && cls === 'mermaid');
|
||||
const isMermaid = className.some(
|
||||
(cls) => typeof cls === 'string' && cls === MERMAID_BLOCK_CLASS
|
||||
);
|
||||
|
||||
if (!isMermaid) return;
|
||||
|
||||
const mermaidId = generateBlockId('mermaid', 'idxMermaidBlock');
|
||||
const mermaidId = generateBlockId(MERMAID_LANGUAGE, 'idxMermaidBlock');
|
||||
|
||||
// Extract the mermaid syntax (text content of the pre element)
|
||||
const diagramText = node.children
|
||||
@@ -60,22 +69,22 @@ export const rehypeEnhanceMermaidBlocks: Plugin<[], Root> = () => {
|
||||
// Store the mermaid syntax in data attribute for copy functionality
|
||||
node.properties = {
|
||||
...node.properties,
|
||||
'data-mermaid-syntax': diagramText,
|
||||
'data-mermaid-id': mermaidId
|
||||
[MERMAID_SYNTAX_ATTR]: diagramText,
|
||||
[MERMAID_ID_ATTR]: mermaidId
|
||||
};
|
||||
|
||||
const actions = [
|
||||
createCopyButton(mermaidId, 'data-mermaid-id', 'Copy mermaid syntax'),
|
||||
createPreviewButton(mermaidId, 'data-mermaid-id', 'Preview diagram')
|
||||
createCopyButton(mermaidId, MERMAID_ID_ATTR, 'Copy mermaid syntax'),
|
||||
createPreviewButton(mermaidId, MERMAID_ID_ATTR, 'Preview diagram')
|
||||
];
|
||||
|
||||
const header = createBlockHeader('mermaid', mermaidId, 'data-mermaid-id', actions);
|
||||
const header = createBlockHeader(MERMAID_LANGUAGE, mermaidId, MERMAID_ID_ATTR, actions);
|
||||
const wrapper = createWrapper(
|
||||
header,
|
||||
node,
|
||||
MERMAID_WRAPPER_CLASS,
|
||||
MERMAID_SCROLL_CONTAINER_CLASS,
|
||||
{ 'data-mermaid-id': mermaidId }
|
||||
{ [MERMAID_ID_ATTR]: mermaidId }
|
||||
);
|
||||
|
||||
// Replace pre with wrapper in parent
|
||||
|
||||
+80
@@ -0,0 +1,80 @@
|
||||
/**
|
||||
* Rehype plugin to enhance svg blocks with wrapper, header, and action buttons.
|
||||
*
|
||||
* Wraps <pre class="svg-block"> elements with a container that includes:
|
||||
* - Language label ("svg")
|
||||
* - Copy button (copies svg source to clipboard)
|
||||
* - Preview button (opens fullscreen preview dialog)
|
||||
*
|
||||
* Operates directly on the HAST tree and reuses the shared code-block builders.
|
||||
*/
|
||||
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Root, Element, ElementContent } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
import {
|
||||
SVG_WRAPPER_CLASS,
|
||||
SVG_SCROLL_CONTAINER_CLASS,
|
||||
SVG_BLOCK_CLASS,
|
||||
SVG_LANGUAGE,
|
||||
SVG_SOURCE_ATTR,
|
||||
SVG_ID_ATTR
|
||||
} from '$lib/constants';
|
||||
import {
|
||||
createBlockHeader,
|
||||
createCopyButton,
|
||||
createPreviewButton,
|
||||
createWrapper,
|
||||
generateBlockId
|
||||
} from './code-block-utils';
|
||||
|
||||
declare global {
|
||||
interface Window {
|
||||
idxSvgBlock?: number;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Rehype plugin that decorates every `<pre>` carrying the svg block class.
 *
 * For each match it:
 * - allocates a block id through `generateBlockId`,
 * - stores the concatenated direct text children and the id on the `<pre>` as
 *   data attributes, so the source stays available for copy and render,
 * - builds a header with a copy and a preview action,
 * - replaces the `<pre>` in its parent with the wrapper returned by `createWrapper`.
 *
 * Only direct text children are read; nested elements contribute nothing.
 */
export const rehypeEnhanceSvgBlocks: Plugin<[], Root> = () => (tree: Root) => {
	visit(tree, 'element', (node: Element, index, parent) => {
		const isPre = node.tagName === 'pre';
		if (!isPre || !parent || index === undefined) return;

		const classes = node.properties?.className;
		if (!Array.isArray(classes) || !classes.includes(SVG_BLOCK_CLASS)) return;

		const blockId = generateBlockId(SVG_LANGUAGE, 'idxSvgBlock');

		// Gather the svg source from the direct text children of the pre element
		let source = '';
		for (const child of node.children) {
			if (child.type === 'text') source += child.value;
		}

		// Keep the source and id on the element for the copy and render steps
		node.properties = {
			...node.properties,
			[SVG_SOURCE_ATTR]: source,
			[SVG_ID_ATTR]: blockId
		};

		const copyAction = createCopyButton(blockId, SVG_ID_ATTR, 'Copy svg source');
		const previewAction = createPreviewButton(blockId, SVG_ID_ATTR, 'Preview svg');
		const header = createBlockHeader(SVG_LANGUAGE, blockId, SVG_ID_ATTR, [
			copyAction,
			previewAction
		]);

		// Swap the pre for its wrapper at the same position in the parent
		(parent.children as ElementContent[])[index] = createWrapper(
			header,
			node,
			SVG_WRAPPER_CLASS,
			SVG_SCROLL_CONTAINER_CLASS,
			{ [SVG_ID_ATTR]: blockId }
		);
	});
};
|
||||
+4
-64
@@ -1,67 +1,7 @@
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Root, Element, ElementContent, Text } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
import { createPreTransform } from './pre-transform';
|
||||
import { MERMAID_BLOCK_CLASS, MERMAID_LANGUAGE } from '$lib/constants';
|
||||
|
||||
/**
|
||||
* Recursively extracts all text content from a HAST node.
|
||||
* Handles nested elements (e.g., span wrappers from syntax highlighting).
|
||||
* Converts mermaid code blocks to <pre class="mermaid"> for client-side rendering.
|
||||
*/
|
||||
function extractText(node: ElementContent): string {
|
||||
if (node.type === 'text') return node.value;
|
||||
if (node.type === 'element') {
|
||||
return (node.children ?? []).map(extractText).join('');
|
||||
}
|
||||
return '';
|
||||
}
|
||||
|
||||
/**
|
||||
* Rehype plugin to convert mermaid code blocks to <pre class="mermaid"> elements.
|
||||
*
|
||||
* Transforms:
|
||||
* <pre><code class="language-mermaid">graph TD; A-->B</code></pre>
|
||||
* into:
|
||||
* <pre class="mermaid">graph TD; A-->B</pre>
|
||||
*
|
||||
* The mermaid library renders these client-side via mermaid.run().
|
||||
*
|
||||
* Must run BEFORE rehypeEnhanceCodeBlocks so mermaid blocks are not wrapped
|
||||
* with code block headers/buttons (they have no <code> child, so they're skipped).
|
||||
*/
|
||||
export const rehypeMermaidPre: Plugin<[], Root> = () => {
|
||||
return (tree: Root) => {
|
||||
visit(tree, 'element', (node: Element, index, parent) => {
|
||||
if (node.tagName !== 'pre' || !parent || index === undefined) return;
|
||||
|
||||
const codeElement = node.children.find(
|
||||
(child): child is Element => child.type === 'element' && child.tagName === 'code'
|
||||
);
|
||||
|
||||
if (!codeElement) return;
|
||||
|
||||
const className = codeElement.properties?.className;
|
||||
if (!Array.isArray(className)) return;
|
||||
|
||||
const isMermaid = className.some(
|
||||
(cls) => typeof cls === 'string' && cls === 'language-mermaid'
|
||||
);
|
||||
|
||||
if (!isMermaid) return;
|
||||
|
||||
// Recursively extract text to handle nested spans from syntax highlighting
|
||||
const diagramText = codeElement.children.map(extractText).join('').trim();
|
||||
|
||||
if (!diagramText) return;
|
||||
|
||||
const mermaidPre: Element = {
|
||||
type: 'element',
|
||||
tagName: 'pre',
|
||||
properties: {
|
||||
className: ['mermaid']
|
||||
},
|
||||
children: [{ type: 'text', value: diagramText } as Text]
|
||||
};
|
||||
|
||||
(parent.children as ElementContent[])[index] = mermaidPre;
|
||||
});
|
||||
};
|
||||
};
|
||||
export const rehypeMermaidPre = createPreTransform(MERMAID_LANGUAGE, MERMAID_BLOCK_CLASS);
|
||||
|
||||
+79
@@ -0,0 +1,79 @@
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Root, Element, ElementContent, Text } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
|
||||
/**
 * Collects the text of a HAST node, descending into element children.
 * Text nodes yield their value, elements yield the concatenation of their
 * children's text (e.g. span wrappers from syntax highlighting), and every
 * other node type yields an empty string.
 */
function extractText(node: ElementContent): string {
	switch (node.type) {
		case 'text':
			return node.value;
		case 'element': {
			let collected = '';
			for (const child of node.children ?? []) {
				collected += extractText(child);
			}
			return collected;
		}
		default:
			return '';
	}
}
|
||||
|
||||
/**
 * Creates a rehype plugin that rewrites `<pre><code class="language-{language}">`
 * blocks into a bare `<pre class="{targetClass}">` whose only child is the
 * trimmed text of the code element.
 *
 * @param languages - One fence language, or several; each is matched as the
 *   class `language-{language}` on the `<code>` element.
 * @param targetClass - Class set on the replacement `<pre>`.
 * @param contentGuard - Optional predicate called with the trimmed text. When
 *   it returns false the block is left untouched. This lets a shared fence
 *   language be claimed only for matching content, e.g. an xml block that
 *   starts with <svg.
 * @returns A plugin that mutates the tree in place.
 *
 * Blocks with empty text are skipped. The replacement has no `<code>` child,
 * so rehypeEnhanceCodeBlocks skips it; this plugin must therefore run BEFORE
 * rehypeEnhanceCodeBlocks. No markup is injected here, rendering happens
 * client-side.
 */
export function createPreTransform(
	languages: string | string[],
	targetClass: string,
	contentGuard?: (text: string) => boolean
): Plugin<[], Root> {
	const languageList = Array.isArray(languages) ? languages : [languages];
	const acceptedClasses = new Set(languageList.map((language) => `language-${language}`));

	const transform = (tree: Root): void => {
		visit(tree, 'element', (node: Element, index, parent) => {
			if (!parent || index === undefined || node.tagName !== 'pre') return;

			const code = node.children.find(
				(child): child is Element => child.type === 'element' && child.tagName === 'code'
			);
			if (!code) return;

			const classes = code.properties?.className;
			if (!Array.isArray(classes)) return;

			const claimed = classes.some((cls) => typeof cls === 'string' && acceptedClasses.has(cls));
			if (!claimed) return;

			// Walk nested spans from syntax highlighting to recover the raw text
			const text = code.children.map(extractText).join('').trim();
			if (!text) return;

			const accepted = contentGuard ? contentGuard(text) : true;
			if (!accepted) return;

			const textNode = { type: 'text', value: text } as Text;
			const replacement: Element = {
				type: 'element',
				tagName: 'pre',
				properties: { className: [targetClass] },
				children: [textNode]
			};

			(parent.children as ElementContent[])[index] = replacement;
		});
	};

	return () => transform;
}
|
||||
@@ -0,0 +1,13 @@
|
||||
import { createPreTransform } from './pre-transform';
|
||||
import { SVG_BLOCK_CLASS, SVG_LANGUAGE, XML_LANGUAGE, SVG_TAG_PREFIX } from '$lib/constants';
|
||||
|
||||
/**
 * Content guard for svg candidates: true when the text begins with the svg tag prefix.
 */
const startsWithSvgTag = (text: string): boolean => text.startsWith(SVG_TAG_PREFIX);

/**
 * Rewrites svg code blocks to <pre class="svg-block"> for client-side rendering.
 * Blocks fenced as xml are claimed too, since models often emit svg inside an
 * xml fence. The guard is applied to both languages, so a block is converted
 * only when its content starts with <svg.
 */
export const rehypeSvgPre = createPreTransform(
	[SVG_LANGUAGE, XML_LANGUAGE],
	SVG_BLOCK_CLASS,
	startsWithSvgTag
);
|
||||
@@ -1,5 +1,7 @@
|
||||
<script lang="ts">
|
||||
import MermaidPreviewControls from './MermaidPreviewControls.svelte';
|
||||
import { mountSvgShadow } from '$lib/utils/svg-shadow';
|
||||
import { SVG_DIALOG_SHADOW_STYLE } from '$lib/constants';
|
||||
|
||||
interface Props {
|
||||
svgHtml: string;
|
||||
@@ -7,6 +9,13 @@
|
||||
|
||||
let { svgHtml }: Props = $props();
|
||||
|
||||
let svgHost = $state<HTMLDivElement | null>(null);
|
||||
|
||||
// Re-mount on every svgHtml change so a live streaming svg keeps rendering while zoomed
|
||||
$effect(() => {
|
||||
if (svgHost) mountSvgShadow(svgHost, svgHtml, SVG_DIALOG_SHADOW_STYLE);
|
||||
});
|
||||
|
||||
// Zoom and pan state
|
||||
let scale = $state(1);
|
||||
let translateX = $state(0);
|
||||
@@ -99,8 +108,7 @@
|
||||
onpointerup={handlePointerUp}
|
||||
onpointerleave={handlePointerUp}
|
||||
>
|
||||
<!-- eslint-disable-next-line no-at-html-tags -->
|
||||
{@html svgHtml}
|
||||
<div bind:this={svgHost}></div>
|
||||
</div>
|
||||
|
||||
<MermaidPreviewControls
|
||||
@@ -111,16 +119,3 @@
|
||||
onResetView={resetView}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<style lang="postcss" scoped>
|
||||
/* Styles for SVGs rendered via {@html} — no Tailwind class can target child elements */
|
||||
.mermaid-preview-diagram :global(svg) {
|
||||
min-height: min(50vh, 12rem);
|
||||
min-width: min(80vw, 20rem);
|
||||
max-width: none !important;
|
||||
max-height: none !important;
|
||||
height: auto !important;
|
||||
width: auto !important;
|
||||
display: block;
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -29,6 +29,7 @@ export * from './latex-protection';
|
||||
export * from './literal-html';
|
||||
export * from './markdown';
|
||||
export * from './mermaid-blocks';
|
||||
export * from './svg-blocks';
|
||||
export * from './max-bundle-size';
|
||||
export * from './mcp';
|
||||
export * from './mcp-form';
|
||||
|
||||
@@ -1,2 +1,9 @@
|
||||
export const MERMAID_WRAPPER_CLASS = 'mermaid-block-wrapper';
|
||||
export const MERMAID_SCROLL_CONTAINER_CLASS = 'mermaid-scroll-container';
|
||||
export const MERMAID_BLOCK_CLASS = 'mermaid';
|
||||
|
||||
export const MERMAID_LANGUAGE = 'mermaid';
|
||||
|
||||
export const MERMAID_SYNTAX_ATTR = 'data-mermaid-syntax';
|
||||
export const MERMAID_ID_ATTR = 'data-mermaid-id';
|
||||
export const MERMAID_RENDERED_ATTR = 'data-mermaid-rendered';
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
export const SVG_WRAPPER_CLASS = 'svg-block-wrapper';
|
||||
export const SVG_SCROLL_CONTAINER_CLASS = 'svg-scroll-container';
|
||||
export const SVG_BLOCK_CLASS = 'svg-block';
|
||||
|
||||
export const SVG_LANGUAGE = 'svg';
|
||||
export const XML_LANGUAGE = 'xml';
|
||||
export const SVG_TAG_PREFIX = '<svg';
|
||||
|
||||
export const SVG_SOURCE_ATTR = 'data-svg-source';
|
||||
export const SVG_ID_ATTR = 'data-svg-id';
|
||||
export const SVG_RENDERED_ATTR = 'data-svg-rendered';
|
||||
|
||||
/**
|
||||
* Hard size ceiling for a single inline svg block.
|
||||
* Above this the source is left as raw text instead of being rendered.
|
||||
*/
|
||||
export const SVG_MAX_BYTES = 256 * 1024;
|
||||
|
||||
/**
|
||||
* DOMPurify config for untrusted svg coming from model output.
|
||||
*
|
||||
* foreignObject and script stay forbidden unconditionally, they are the only
|
||||
* inline svg vectors that execute arbitrary html or js. Everything else is
|
||||
* allowed for maximum rendering compatibility: href and xlink:href stay so
|
||||
* use, image, a and animateMotion work, and DOMPurify still neutralizes
|
||||
* javascript: and data: uri schemes natively. External resource refs are
|
||||
* allowed by design on a local first tool, the user browser fetches them.
|
||||
*
|
||||
* The sanitized svg is always mounted inside a shadow root (see svg-shadow),
|
||||
* so an author <style> stays scoped to that root and can not reach the page.
|
||||
*/
|
||||
export const SVG_SANITIZE_CONFIG = {
|
||||
USE_PROFILES: { svg: true, svgFilters: true },
|
||||
FORBID_TAGS: ['foreignObject', 'script']
|
||||
};
|
||||
|
||||
/**
|
||||
* Shadow root style for an inline svg block. Mirrors the centered, padded
|
||||
* sizing the light dom used before the svg moved behind a shadow boundary.
|
||||
*/
|
||||
export const SVG_INLINE_SHADOW_STYLE =
|
||||
':host{display:block;width:100%;text-align:center}svg{display:block;margin:0 auto;width:auto;height:auto;max-width:100%;max-height:70vh;min-height:8rem;padding:3rem 1rem}';
|
||||
|
||||
/**
|
||||
* Shadow root style for the zoom dialog svg. Lets the svg grow past its
|
||||
* intrinsic size so pan and zoom have room to work.
|
||||
*/
|
||||
export const SVG_DIALOG_SHADOW_STYLE =
|
||||
':host{display:inline-block}svg{min-height:min(50vh,12rem);min-width:min(80vw,20rem);max-width:none;max-height:none;height:auto;width:auto;display:block}';
|
||||
@@ -0,0 +1,22 @@
|
||||
import DOMPurify from 'dompurify';
|
||||
import { SVG_MAX_BYTES, SVG_SANITIZE_CONFIG, SVG_TAG_PREFIX } from '$lib/constants';
|
||||
|
||||
/**
 * Sanitizes a raw svg string for safe inline rendering.
 *
 * @param source - Untrusted svg markup, typically the text of a code block.
 * @returns The cleaned svg markup, or an empty string when the input is empty,
 *   exceeds SVG_MAX_BYTES once encoded as UTF-8, does not start with the svg
 *   tag prefix, or sanitizes to something without an svg tag. An empty return
 *   tells the caller to keep the raw code block instead of rendering.
 */
export function sanitizeSvg(source: string): string {
	const trimmed = source.trim();

	// The ceiling is expressed in bytes, so measure encoded size rather than
	// String.length, which counts UTF-16 code units and under-reports non-ASCII text
	if (!trimmed || exceedsUtf8ByteLimit(trimmed, SVG_MAX_BYTES)) return '';

	if (!trimmed.startsWith(SVG_TAG_PREFIX)) return '';

	const clean = DOMPurify.sanitize(trimmed, SVG_SANITIZE_CONFIG) as unknown as string;

	// The sanitizer may strip the whole document; treat that as "not renderable"
	if (!clean || !clean.includes(SVG_TAG_PREFIX)) return '';

	return clean;
}

/**
 * Reports whether `text` needs more than `maxBytes` bytes when encoded as UTF-8.
 * Encodes only when the code-unit count alone cannot settle the answer.
 */
function exceedsUtf8ByteLimit(text: string, maxBytes: number): boolean {
	// Every UTF-16 code unit encodes to at least one byte
	if (text.length > maxBytes) return true;

	// ...and to at most three bytes, so short inputs are safe without encoding
	if (text.length * 3 <= maxBytes) return false;

	return new TextEncoder().encode(text).length > maxBytes;
}
|
||||
@@ -0,0 +1,10 @@
|
||||
/**
 * Renders svg markup into a shadow root on `host`.
 *
 * The host's existing shadow root is reused when present; otherwise a new one
 * is attached in `open` mode. Non-empty `markup` is written after a <style>
 * element holding `style`; empty `markup` clears the shadow root.
 *
 * The shadow boundary keeps the svg's own <style> and animations scoped to the
 * host so model authored css cannot reach the surrounding page. Nothing is
 * sanitized here: `markup` is assigned to innerHTML as given, so the caller
 * must pass markup that is already sanitized.
 *
 * @param host - Element that owns the shadow root.
 * @param markup - Already-sanitized svg markup, or an empty string to clear.
 * @param style - Css text placed in the shadow root's <style> element.
 */
export function mountSvgShadow(host: HTMLElement, markup: string, style: string): void {
	const existing = host.shadowRoot;
	const shadow =
		existing !== null && existing !== undefined ? existing : host.attachShadow({ mode: 'open' });

	if (markup) {
		shadow.innerHTML = `<style>${style}</style>${markup}`;
	} else {
		shadow.innerHTML = '';
	}
}
|
||||
Reference in New Issue
Block a user