mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-30 17:47:40 +02:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 091d98e2c5 | |||
| df27d80ae3 |
@@ -33,7 +33,6 @@ FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
|
||||
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -1418,6 +1418,7 @@ jobs:
|
||||
echo "FIXME: test on devices"
|
||||
|
||||
openEuler-latest-cmake-cann:
|
||||
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
|
||||
@@ -130,7 +130,6 @@ poetry.toml
|
||||
# Local scripts
|
||||
/run-vim.sh
|
||||
/run-chat.sh
|
||||
/run-spec.sh
|
||||
/.ccache/
|
||||
|
||||
# IDE
|
||||
|
||||
@@ -482,6 +482,21 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
||||
|
||||
</details>
|
||||
|
||||
## [`llama-run`](tools/run)
|
||||
|
||||
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
|
||||
|
||||
- <details>
|
||||
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
|
||||
|
||||
```bash
|
||||
llama-run granite-code
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
[^3]: [RamaLama](https://github.com/containers/ramalama)
|
||||
|
||||
## [`llama-simple`](examples/simple)
|
||||
|
||||
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
|
||||
@@ -585,6 +600,7 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
|
||||
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
|
||||
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
|
||||
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
|
||||
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
|
||||
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
|
||||
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
|
||||
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
|
||||
|
||||
+51
-127
@@ -679,6 +679,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
"llama-quantize",
|
||||
"llama-qwen2vl-cli",
|
||||
"llama-retrieval",
|
||||
"llama-run",
|
||||
"llama-save-load-state",
|
||||
"llama-server",
|
||||
"llama-simple",
|
||||
@@ -853,54 +854,6 @@ bool common_arg_utils::is_autoy(const std::string & value) {
|
||||
return value == "auto" || value == "-1";
|
||||
}
|
||||
|
||||
// Simple CSV parser that handles quoted fields and escaped quotes
|
||||
// example:
|
||||
// input: value1,"value, with, commas","value with ""escaped"" quotes",value4
|
||||
// output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
|
||||
static std::vector<std::string> parse_csv_row(const std::string& input) {
|
||||
std::vector<std::string> fields;
|
||||
std::string field;
|
||||
bool in_quotes = false;
|
||||
|
||||
for (size_t i = 0; i < input.length(); ++i) {
|
||||
char ch = input[i];
|
||||
|
||||
if (ch == '"') {
|
||||
if (!in_quotes) {
|
||||
// start of quoted field (only valid if at beginning of field)
|
||||
if (!field.empty()) {
|
||||
// quote appeared in middle of unquoted field, treat as literal
|
||||
field += '"';
|
||||
} else {
|
||||
in_quotes = true; // start
|
||||
}
|
||||
} else {
|
||||
if (i + 1 < input.length() && input[i + 1] == '"') {
|
||||
// escaped quote: ""
|
||||
field += '"';
|
||||
++i; // skip the next quote
|
||||
} else {
|
||||
in_quotes = false; // end
|
||||
}
|
||||
}
|
||||
} else if (ch == ',') {
|
||||
if (in_quotes) {
|
||||
field += ',';
|
||||
} else {
|
||||
fields.push_back(std::move(field));
|
||||
field.clear();
|
||||
}
|
||||
} else {
|
||||
field += ch;
|
||||
}
|
||||
}
|
||||
|
||||
// Add the last field
|
||||
fields.push_back(std::move(field));
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
// per-example default params
|
||||
// we define here to make sure it's included in llama-gen-docs
|
||||
@@ -1297,7 +1250,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--in-file"}, "FNAME",
|
||||
"an input file (use comma-separated values to specify multiple files)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
std::ifstream file(item);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
||||
@@ -1444,7 +1397,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, bool value) {
|
||||
params.warmup = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
|
||||
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
||||
add_opt(common_arg(
|
||||
{"--spm-infill"},
|
||||
string_format(
|
||||
@@ -1760,7 +1713,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
|
||||
add_opt(common_arg(
|
||||
{"--attention"}, "{causal,non-causal}",
|
||||
"attention type for embeddings, use model default if unspecified",
|
||||
@@ -2049,7 +2002,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--image", "--audio"}, "FILE",
|
||||
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
params.image.emplace_back(item);
|
||||
}
|
||||
}
|
||||
@@ -2088,22 +2041,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"--mmap"},
|
||||
{"--no-mmap"},
|
||||
string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
||||
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.use_mmap = value;
|
||||
if (value) {
|
||||
params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_MMAP"));
|
||||
add_opt(common_arg(
|
||||
{"-dio", "--direct-io"},
|
||||
{"-ndio", "--no-direct-io"},
|
||||
string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.use_direct_io = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_DIO"));
|
||||
add_opt(common_arg(
|
||||
{"--numa"}, "TYPE",
|
||||
"attempt optimizations that help on some NUMA systems\n"
|
||||
@@ -2255,7 +2197,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
std::vector<std::string> split_arg{ it, {} };
|
||||
if (split_arg.size() >= llama_max_devices()) {
|
||||
throw std::invalid_argument(
|
||||
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
|
||||
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
||||
);
|
||||
}
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
@@ -2295,28 +2237,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_ARG_FIT"));
|
||||
add_opt(common_arg(
|
||||
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
|
||||
string_format("target margin per device for --fit, comma-separated list of values, "
|
||||
"single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::string arg_next = value;
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{ R"([,/]+)" };
|
||||
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
||||
std::vector<std::string> split_arg{ it, {} };
|
||||
if (split_arg.size() >= llama_max_devices()) {
|
||||
throw std::invalid_argument(
|
||||
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
|
||||
);
|
||||
}
|
||||
if (split_arg.size() == 1) {
|
||||
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
|
||||
return;
|
||||
}
|
||||
for (size_t i = 0; i < split_arg.size(); i++) {
|
||||
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
|
||||
}
|
||||
{ "-fitt", "--fit-target" }, "MiB",
|
||||
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
|
||||
[](common_params & params, int value) {
|
||||
params.fit_params_target = value * size_t(1024*1024);
|
||||
}
|
||||
).set_env("LLAMA_ARG_FIT_TARGET"));
|
||||
add_opt(common_arg(
|
||||
@@ -2335,12 +2259,37 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--override-kv"}, "KEY=TYPE:VALUE,...",
|
||||
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
|
||||
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
|
||||
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
|
||||
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
|
||||
std::vector<std::string> kv_overrides;
|
||||
|
||||
std::string current;
|
||||
bool escaping = false;
|
||||
|
||||
for (const char c : value) {
|
||||
if (escaping) {
|
||||
current.push_back(c);
|
||||
escaping = false;
|
||||
} else if (c == '\\') {
|
||||
escaping = true;
|
||||
} else if (c == ',') {
|
||||
kv_overrides.push_back(current);
|
||||
current.clear();
|
||||
} else {
|
||||
current.push_back(c);
|
||||
}
|
||||
}
|
||||
|
||||
if (escaping) {
|
||||
current.push_back('\\');
|
||||
}
|
||||
|
||||
kv_overrides.push_back(current);
|
||||
|
||||
for (const auto & kv_override : kv_overrides) {
|
||||
if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
|
||||
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2357,7 +2306,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--lora"}, "FNAME",
|
||||
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
|
||||
}
|
||||
}
|
||||
@@ -2368,7 +2317,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
|
||||
"note: use comma-separated values",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
auto parts = string_split<std::string>(item, ':');
|
||||
if (parts.size() != 2) {
|
||||
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
|
||||
@@ -2382,7 +2331,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--control-vector"}, "FNAME",
|
||||
"add a control vector\nnote: use comma-separated values to add multiple control vectors",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
params.control_vectors.push_back({ 1.0f, item, });
|
||||
}
|
||||
}
|
||||
@@ -2392,7 +2341,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"add a control vector with user defined scaling SCALE\n"
|
||||
"note: use comma-separated values (format: FNAME:SCALE,...)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
auto parts = string_split<std::string>(item, ':');
|
||||
if (parts.size() != 2) {
|
||||
throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
|
||||
@@ -2490,7 +2439,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--context-file"}, "FNAME",
|
||||
"file to load context from (use comma-separated values to specify multiple files)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
std::ifstream file(item, std::ios::binary);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
||||
@@ -2637,7 +2586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, int value) {
|
||||
params.embd_normalize = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
||||
add_opt(common_arg(
|
||||
{"--embd-output-format"}, "FORMAT",
|
||||
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
|
||||
@@ -2715,7 +2664,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.embedding = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
||||
add_opt(common_arg(
|
||||
{"--rerank", "--reranking"},
|
||||
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
||||
@@ -2726,13 +2675,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
||||
add_opt(common_arg(
|
||||
{"--api-key"}, "KEY",
|
||||
"API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
|
||||
"API key to use for authentication (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & key : parse_csv_row(value)) {
|
||||
if (!key.empty()) {
|
||||
params.api_keys.push_back(key);
|
||||
}
|
||||
}
|
||||
params.api_keys.push_back(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
||||
add_opt(common_arg(
|
||||
@@ -2746,7 +2691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
std::string key;
|
||||
while (std::getline(key_file, key)) {
|
||||
if (!key.empty()) {
|
||||
params.api_keys.push_back(key);
|
||||
params.api_keys.push_back(key);
|
||||
}
|
||||
}
|
||||
key_file.close();
|
||||
@@ -2768,7 +2713,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--chat-template-kwargs"}, "STRING",
|
||||
"sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
|
||||
string_format("sets additional params for the json template parser"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
auto parsed = json::parse(value);
|
||||
for (const auto & item : parsed.items()) {
|
||||
@@ -3406,27 +3351,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg(
|
||||
{"--save-logits"},
|
||||
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.save_logits = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--logits-output-dir"}, "PATH",
|
||||
string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.logits_output_dir = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--tensor-filter"}, "REGEX",
|
||||
"filter tensor names for debug output (regex pattern, can be specified multiple times)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.tensor_filter.push_back(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_DEBUG}));
|
||||
|
||||
// presets
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -129,3 +129,11 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
struct common_remote_params {
|
||||
std::vector<std::string> headers;
|
||||
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
|
||||
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
|
||||
};
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
|
||||
+1
-2
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
}
|
||||
|
||||
@@ -1366,7 +1366,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_direct_io = params.use_direct_io;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
mparams.use_extra_bufts = !params.no_extra_bufts;
|
||||
|
||||
+7
-16
@@ -80,7 +80,6 @@ int32_t cpu_get_num_math();
|
||||
//
|
||||
|
||||
enum llama_example {
|
||||
LLAMA_EXAMPLE_DEBUG,
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_COMPLETION,
|
||||
@@ -332,14 +331,12 @@ struct common_params {
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
|
||||
// margin per device in bytes for fitting parameters to free memory:
|
||||
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
@@ -375,11 +372,6 @@ struct common_params {
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||
|
||||
// llama-debug specific options
|
||||
std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
|
||||
bool save_logits = false; // whether to save logits to files // NOLINT
|
||||
std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
|
||||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
@@ -430,8 +422,7 @@ struct common_params {
|
||||
bool kv_unified = false; // enable unified KV cache
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool use_mmap = true; // enable mmap to use filesystem cache
|
||||
bool use_direct_io = true; // read from disk without buffering for faster model loading
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool display_prompt = true; // print prompt before generation
|
||||
|
||||
+31
-55
@@ -308,8 +308,7 @@ static bool common_download_head(CURL * curl,
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
const std::string & bearer_token) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
@@ -331,11 +330,6 @@ static bool common_download_file_single_online(const std::string & url,
|
||||
common_load_model_from_url_headers headers;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
curl_slist_ptr http_headers;
|
||||
|
||||
for (const auto & h : custom_headers) {
|
||||
std::string s = h.first + ": " + h.second;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
|
||||
}
|
||||
const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
|
||||
if (!was_perform_successful) {
|
||||
head_request_ok = false;
|
||||
@@ -460,10 +454,8 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
||||
}
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
|
||||
for (const auto & header : params.headers) {
|
||||
std::string header_ = header.first + ": " + header.second;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
@@ -627,8 +619,7 @@ static bool common_pull_file(httplib::Client & cli,
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
const std::string & bearer_token) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
|
||||
@@ -638,9 +629,6 @@ static bool common_download_file_single_online(const std::string & url,
|
||||
if (!bearer_token.empty()) {
|
||||
default_headers.insert({"Authorization", "Bearer " + bearer_token});
|
||||
}
|
||||
for (const auto & h : custom_headers) {
|
||||
default_headers.emplace(h.first, h.second);
|
||||
}
|
||||
cli.set_default_headers(default_headers);
|
||||
|
||||
const bool file_exists = std::filesystem::exists(path);
|
||||
@@ -746,9 +734,13 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
||||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
|
||||
|
||||
for (const auto & header : params.headers) {
|
||||
headers.emplace(header.first, header.second);
|
||||
size_t pos = header.find(':');
|
||||
if (pos != std::string::npos) {
|
||||
headers.emplace(header.substr(0, pos), header.substr(pos + 1));
|
||||
} else {
|
||||
headers.emplace(header, "");
|
||||
}
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
@@ -780,10 +772,9 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
||||
static bool common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
bool offline) {
|
||||
if (!offline) {
|
||||
return common_download_file_single_online(url, path, bearer_token, headers);
|
||||
return common_download_file_single_online(url, path, bearer_token);
|
||||
}
|
||||
|
||||
if (!std::filesystem::exists(path)) {
|
||||
@@ -797,24 +788,13 @@ static bool common_download_file_single(const std::string & url,
|
||||
|
||||
// download multiple files from remote URLs to local paths
|
||||
// the input is a vector of pairs <url, path>
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
futures_download.reserve(urls.size());
|
||||
|
||||
for (auto const & item : urls) {
|
||||
futures_download.push_back(
|
||||
std::async(
|
||||
std::launch::async,
|
||||
[&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token, offline, headers);
|
||||
},
|
||||
item
|
||||
)
|
||||
);
|
||||
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token, offline);
|
||||
}, item));
|
||||
}
|
||||
|
||||
// Wait for all downloads to complete
|
||||
@@ -827,17 +807,17 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
|
||||
return true;
|
||||
}
|
||||
|
||||
bool common_download_model(const common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline) {
|
||||
// Basic validation of the model.url
|
||||
if (model.url.empty()) {
|
||||
LOG_ERR("%s: invalid model url\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token, offline, headers)) {
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -896,16 +876,13 @@ bool common_download_model(const common_params_model & model,
|
||||
}
|
||||
|
||||
// Download in parallel
|
||||
common_download_file_multiple(urls, bearer_token, offline, headers);
|
||||
common_download_file_multiple(urls, bearer_token, offline);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & custom_headers) {
|
||||
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
@@ -916,10 +893,10 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
|
||||
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
|
||||
// headers
|
||||
common_header_list headers = custom_headers;
|
||||
headers.push_back({"Accept", "application/json"});
|
||||
std::vector<std::string> headers;
|
||||
headers.push_back("Accept: application/json");
|
||||
if (!bearer_token.empty()) {
|
||||
headers.push_back({"Authorization", "Bearer " + bearer_token});
|
||||
headers.push_back("Authorization: Bearer " + bearer_token);
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
||||
@@ -1054,10 +1031,9 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
||||
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
|
||||
std::string manifest_url = url_prefix + "/manifests/" + tag;
|
||||
common_remote_params manifest_params;
|
||||
manifest_params.headers.push_back({"Authorization", "Bearer " + token});
|
||||
manifest_params.headers.push_back({"Accept",
|
||||
"application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json"
|
||||
});
|
||||
manifest_params.headers.push_back("Authorization: Bearer " + token);
|
||||
manifest_params.headers.push_back(
|
||||
"Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
|
||||
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
|
||||
if (manifest_res.first != 200) {
|
||||
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
|
||||
@@ -1094,7 +1070,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
||||
std::string local_path = fs_get_cache_file(model_filename);
|
||||
|
||||
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
|
||||
if (!common_download_file_single(blob_url, local_path, token, false, {})) {
|
||||
if (!common_download_file_single(blob_url, local_path, token, false)) {
|
||||
throw std::runtime_error("Failed to download Docker Model");
|
||||
}
|
||||
|
||||
@@ -1108,11 +1084,11 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
||||
|
||||
#else
|
||||
|
||||
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
|
||||
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
|
||||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
|
||||
bool common_download_model(const common_params_model &, const std::string &, bool) {
|
||||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
|
||||
+5
-18
@@ -1,21 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct common_params_model;
|
||||
|
||||
using common_header = std::pair<std::string, std::string>;
|
||||
using common_header_list = std::vector<common_header>;
|
||||
|
||||
struct common_remote_params {
|
||||
common_header_list headers;
|
||||
long timeout = 0; // in seconds, 0 means no timeout
|
||||
long max_size = 0; // unlimited if 0
|
||||
};
|
||||
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
//
|
||||
// download functionalities
|
||||
//
|
||||
|
||||
struct common_cached_model_info {
|
||||
std::string manifest_path;
|
||||
@@ -50,17 +41,13 @@ struct common_hf_file_res {
|
||||
common_hf_file_res common_get_hf_file(
|
||||
const std::string & hf_repo_with_tag,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers = {}
|
||||
);
|
||||
bool offline);
|
||||
|
||||
// returns true if download succeeded
|
||||
bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers = {}
|
||||
);
|
||||
bool offline);
|
||||
|
||||
// returns list of cached models
|
||||
std::vector<common_cached_model_info> common_list_cached_models();
|
||||
|
||||
+6
-28
@@ -771,8 +771,8 @@ class TextModel(ModelBase):
|
||||
|
||||
self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
|
||||
|
||||
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
|
||||
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
|
||||
rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
|
||||
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
|
||||
|
||||
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
|
||||
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
|
||||
@@ -7212,7 +7212,6 @@ class DeepseekModel(TextModel):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"KimiVLForConditionalGeneration",
|
||||
"YoutuForCausalLM",
|
||||
"YoutuVLForConditionalGeneration"
|
||||
)
|
||||
class DeepseekV2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
@@ -9956,27 +9955,6 @@ class LFM2Model(TextModel):
|
||||
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
    """LFM2 variant that additionally exports a dense projection tensor.

    The dense weight is read from a separate ``1_Dense/model.safetensors`` file
    inside the model directory and emitted as ``dense_2.weight``.
    """

    model_arch = gguf.MODEL_ARCH.LFM2
    dense_tensor_name = "dense_2"

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Every tensor except the dense projection gets a "model." prefix
        # before being handed to the parent implementation.
        is_dense = name.startswith(self.dense_tensor_name)
        prefixed_name = name if is_dense else "model." + name

        return super().modify_tensors(data_torch, prefixed_name, bid)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        # dense tensor is stored in a separate safetensors file
        from safetensors.torch import load_file

        dense_path = self.dir_model / "1_Dense" / "model.safetensors"
        assert dense_path.is_file()

        weight = load_file(dense_path)["linear.weight"]
        # The first dimension of the dense weight is recorded as the output embedding length.
        self.gguf_writer.add_embedding_length_out(weight.shape[0])
        yield f"{self.dense_tensor_name}.weight", weight.clone()
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2MoeForCausalLM")
|
||||
class LFM2MoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2MOE
|
||||
@@ -10696,8 +10674,8 @@ class JanusProVisionModel(MmprojModel):
|
||||
return []
|
||||
|
||||
|
||||
@ModelBase.register("YoutuVLForConditionalGeneration")
|
||||
class YoutuVLVisionModel(MmprojModel):
|
||||
@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
|
||||
class YOUTUVLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
@@ -10974,8 +10952,8 @@ def parse_args() -> argparse.Namespace:
|
||||
|
||||
parser.add_argument(
|
||||
"--sentence-transformers-dense-modules", action="store_true",
|
||||
help=("Whether to include sentence-transformers dense modules. "
|
||||
"It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
|
||||
help=("Whether to include sentence-transformers dense modules."
|
||||
"It can be used for sentence-transformers models, like google/embeddinggemma-300m"
|
||||
"Default these modules are not included.")
|
||||
)
|
||||
|
||||
|
||||
+1
-1
@@ -22,7 +22,7 @@ Legend:
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
|
||||
+350
-413
File diff suppressed because it is too large
Load Diff
@@ -15,7 +15,6 @@ llama_add_compile_flags()
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(debug)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
|
||||
@@ -35,6 +34,7 @@ else()
|
||||
add_subdirectory(gen-docs)
|
||||
add_subdirectory(training)
|
||||
add_subdirectory(diffusion)
|
||||
add_subdirectory(model-conversion)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
# llama.cpp/examples/debug
|
||||
|
||||
This is a utility intended to help debug a model by registering a callback that
|
||||
logs GGML operations and tensor data. It can also store the generated logits or
|
||||
embeddings as well as the prompt and token ids for comparison with the original
|
||||
model.
|
||||
|
||||
### Usage
|
||||
|
||||
```shell
|
||||
llama-debug \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model phi-2-q4_0.gguf \
|
||||
--prompt hello \
|
||||
--save-logits \
|
||||
--verbose
|
||||
```
|
||||
The tensor data is logged at debug level and requires the --verbose flag. The reason
|
||||
for this is that while useful for a model with many layers there can be a lot of
|
||||
output. You can filter the tensor names using the `--tensor-filter` option.
|
||||
|
||||
A recommended approach is to first run without `--verbose` and see if the
|
||||
generated logits/embeddings are close to the original model. If they are not,
|
||||
then it might be required to inspect tensor by tensor and in that case it is
|
||||
useful to enable the `--verbose` flag along with `--tensor-filter` to focus on
|
||||
specific tensors.
|
||||
|
||||
### Options
|
||||
This example supports all standard `llama.cpp` options and also accepts the
|
||||
following options:
|
||||
```console
|
||||
$ llama-debug --help
|
||||
...
|
||||
|
||||
----- example-specific params -----
|
||||
|
||||
--save-logits save final logits to files for verification (default: false)
|
||||
--logits-output-dir PATH directory for saving logits output files (default: data)
|
||||
--tensor-filter REGEX filter tensor names for debug output (regex pattern, can be specified multiple times)
|
||||
```
|
||||
|
||||
### Output Files
|
||||
|
||||
When `--save-logits` is enabled, the following files are created in the output
|
||||
directory:
|
||||
|
||||
* `llamacpp-<model>[-embeddings].bin` - Binary output (logits or embeddings)
|
||||
* `llamacpp-<model>[-embeddings].txt` - Text output (logits or embeddings, one per line)
|
||||
* `llamacpp-<model>[-embeddings]-prompt.txt` - Prompt text and token IDs
|
||||
* `llamacpp-<model>[-embeddings]-tokens.bin` - Binary token IDs for programmatic comparison
|
||||
|
||||
These files can be compared against the original model's output to verify the
|
||||
converted model.
|
||||
@@ -1,421 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
|
||||
// Logs example invocations of this tool.
// The first (unnamed) int parameter is unused; argv[0] is substituted for every
// "{prog}" placeholder in the usage text before it is written with LOG.
static void print_usage(int, char ** argv) {
|
||||
const std::string usage_template = R"(
|
||||
example usage:
|
||||
|
||||
Print tensors:
|
||||
|
||||
{prog} -m model.gguf -p "Hello my name is" --verbose
|
||||
|
||||
The tensors to be printed can be filtered with --tensor-filter option.
|
||||
|
||||
Save logits/embeddings:
|
||||
|
||||
{prog} -m model.gguf -p "Hello my name is" --save-logits
|
||||
|
||||
Add --embedding to save embeddings)" "\n";
|
||||
|
||||
// Fix the source code indentation above that is introduced by the raw string literal:
// a newline followed by exactly eight spaces is collapsed to a bare newline.
|
||||
std::string usage = std::regex_replace(usage_template, std::regex("\\n {8}"), "\n");
|
||||
// Replace the literal "{prog}" placeholder with the program name.
usage = std::regex_replace(usage, std::regex("\\{prog\\}"), argv[0]);
|
||||
LOG("%s\n", usage.c_str());
|
||||
}
|
||||
|
||||
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
struct callback_data {
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<std::regex> tensor_filters;
|
||||
|
||||
callback_data() = default;
|
||||
|
||||
callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
|
||||
for (const auto & pattern : filter_patterns) {
|
||||
try {
|
||||
std::string anchored_pattern = "^" + pattern;
|
||||
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
|
||||
} catch (const std::regex_error & e) {
|
||||
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
|
||||
}
|
||||
}
|
||||
params.cb_eval = ggml_debug;
|
||||
params.cb_eval_user_data = this;
|
||||
}
|
||||
};
|
||||
|
||||
struct output_data {
|
||||
float * data_ptr = nullptr;
|
||||
int data_size = 0;
|
||||
std::string type_suffix;
|
||||
std::vector<float> storage;
|
||||
std::string prompt;
|
||||
std::vector<llama_token> tokens;
|
||||
|
||||
output_data(llama_context * ctx, const llama_model * model, const common_params & params) {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
|
||||
tokens = common_tokenize(ctx, params.prompt, add_bos);
|
||||
prompt = params.prompt;
|
||||
|
||||
if (params.embedding) {
|
||||
const int n_embd = llama_model_n_embd_out(model);
|
||||
const bool pooling_enabled = llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
|
||||
const int n_embd_count = pooling_enabled ? 1 : tokens.size();
|
||||
const int n_embeddings = n_embd * n_embd_count;
|
||||
|
||||
float * embeddings;
|
||||
if (pooling_enabled) {
|
||||
embeddings = llama_get_embeddings_seq(ctx, 0);
|
||||
storage.resize(n_embeddings);
|
||||
common_embd_normalize(embeddings, storage.data(), n_embeddings, params.embd_normalize);
|
||||
embeddings = storage.data();
|
||||
} else {
|
||||
embeddings = llama_get_embeddings(ctx);
|
||||
}
|
||||
|
||||
data_ptr = embeddings;
|
||||
data_size = n_embeddings;
|
||||
type_suffix = "-embeddings";
|
||||
} else {
|
||||
const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1);
|
||||
const int n_logits = llama_vocab_n_tokens(vocab);
|
||||
|
||||
data_ptr = const_cast<float*>(logits);
|
||||
data_size = n_logits;
|
||||
type_suffix = "";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
str += std::to_string(t->ne[i]);
|
||||
if (i + 1 < GGML_MAX_DIMS) {
|
||||
str += ", ";
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
// Widens a bf16 value to fp32: the 16 stored bits become the upper half of a
// 32-bit pattern (lower half zero), which is then read back as a float.
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        uint32_t bits;
        float    value;
    } pun;
    pun.bits = static_cast<uint32_t>(h.bits) << 16;
    return pun.value;
}
|
||||
|
||||
static float ggml_get_float_value(const uint8_t * data, ggml_type type,
|
||||
const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||
case GGML_TYPE_F32:
|
||||
return *(const float *) &data[i];
|
||||
case GGML_TYPE_I64:
|
||||
return (float) *(const int64_t *) &data[i];
|
||||
case GGML_TYPE_I32:
|
||||
return (float) *(const int32_t *) &data[i];
|
||||
case GGML_TYPE_I16:
|
||||
return (float) *(const int16_t *) &data[i];
|
||||
case GGML_TYPE_I8:
|
||||
return (float) *(const int8_t *) &data[i];
|
||||
case GGML_TYPE_BF16:
|
||||
return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
float sum_sq = 0.0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
sum += v;
|
||||
sum_sq += v * v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
LOG_DBG(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2*n) {
|
||||
LOG_DBG(" ..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
LOG_DBG(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2*n) {
|
||||
LOG_DBG(" ..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
LOG_DBG(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2*n) {
|
||||
LOG_DBG("..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
LOG_DBG("%12.4f", v);
|
||||
if (i0 < ne[0] - 1) {
|
||||
LOG_DBG(", ");
|
||||
}
|
||||
}
|
||||
LOG_DBG("],\n");
|
||||
}
|
||||
LOG_DBG(" ],\n");
|
||||
}
|
||||
LOG_DBG(" ]\n");
|
||||
LOG_DBG(" sum = %f\n", sum);
|
||||
LOG_DBG(" sum_sq = %f\n", sum_sq);
|
||||
}
|
||||
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GGML operations callback during the graph execution.
|
||||
*
|
||||
* @param t current tensor
|
||||
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||
* see ggml_backend_sched_eval_callback
|
||||
* @param user_data user data to pass at each call back
|
||||
* @return true to receive data or continue the graph, false otherwise
|
||||
*/
|
||||
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (callback_data *) user_data;
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
if (ask) {
|
||||
return true; // Always retrieve data
|
||||
}
|
||||
|
||||
bool matches_filter = cb_data->tensor_filters.empty();
|
||||
|
||||
if (!matches_filter) {
|
||||
for (const auto & filter : cb_data->tensor_filters) {
|
||||
if (std::regex_search(t->name, filter)) {
|
||||
matches_filter = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
char src1_str[128] = {0};
|
||||
if (src1) {
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
if (matches_filter) {
|
||||
LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
t->name,
|
||||
ggml_type_name(t->type),
|
||||
ggml_op_desc(t),
|
||||
src0->name,
|
||||
ggml_ne_string(src0).c_str(),
|
||||
src1 ? src1_str : "",
|
||||
ggml_ne_string(t).c_str());
|
||||
}
|
||||
|
||||
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
cb_data->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||
}
|
||||
|
||||
if (!ggml_is_quantized(t->type) && matches_filter) {
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
|
||||
std::filesystem::create_directory(output_dir);
|
||||
auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
|
||||
|
||||
// Save logits/embeddings to binary file.
|
||||
{
|
||||
std::filesystem::path filepath{base_path.string() + ".bin"};
|
||||
std::ofstream file{filepath, std::ios::binary};
|
||||
if (!file) {
|
||||
throw std::runtime_error("failed to open binary output file: " + filepath.string());
|
||||
}
|
||||
file.write(reinterpret_cast<const char*>(output.data_ptr), output.data_size * sizeof(float));
|
||||
LOG("Data saved to %s\n", filepath.c_str());
|
||||
}
|
||||
|
||||
// Save logits/embeddings to text file.
|
||||
{
|
||||
std::filesystem::path filepath{base_path.string() + ".txt"};
|
||||
std::ofstream file{filepath};
|
||||
if (!file) {
|
||||
throw std::runtime_error("failed to open text output file: " + filepath.string());
|
||||
}
|
||||
for (int i = 0; i < output.data_size; i++) {
|
||||
file << i << ": " << output.data_ptr[i] << '\n';
|
||||
}
|
||||
LOG("Data saved to %s\n", filepath.c_str());
|
||||
}
|
||||
|
||||
// Save prompt and tokens to text file.
|
||||
{
|
||||
std::filesystem::path filepath{base_path.string() + "-prompt.txt"};
|
||||
std::ofstream file{filepath};
|
||||
if (!file) {
|
||||
throw std::runtime_error("failed to open prompt output file: " + filepath.string());
|
||||
}
|
||||
|
||||
file << "prompt: " << output.prompt << '\n';
|
||||
file << "n_tokens: " << output.tokens.size() << '\n';
|
||||
|
||||
file << "token ids: ";
|
||||
for (size_t i = 0; i < output.tokens.size(); i++) {
|
||||
file << output.tokens[i];
|
||||
if (i + 1 < output.tokens.size()) {
|
||||
file << ", ";
|
||||
}
|
||||
}
|
||||
file << '\n';
|
||||
LOG("Prompt saved to %s\n", filepath.c_str());
|
||||
}
|
||||
|
||||
// Save token ids to binary file.
|
||||
{
|
||||
std::filesystem::path filepath{base_path.string() + "-tokens.bin"};
|
||||
std::ofstream file{filepath, std::ios::binary};
|
||||
if (!file) {
|
||||
throw std::runtime_error("failed to open tokens binary file: " + filepath.string());
|
||||
}
|
||||
file.write(reinterpret_cast<const char*>(output.tokens.data()), output.tokens.size() * sizeof(llama_token));
|
||||
LOG("Tokens saved to %s\n", filepath.c_str());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Logs whether the vocabulary adds a BOS token, the prompt text, and every token of the
// prompt as "<piece>(<id>)". Tokens that cannot be converted are reported and skipped.
static void print_tokenized_prompt(llama_context * ctx, const std::vector<llama_token> & tokens, const std::string & prompt) {
    const llama_vocab * vocab = llama_model_get_vocab(llama_get_model(ctx));

    const char * add_bos_str = llama_vocab_get_add_bos(vocab) ? "true" : "false";
    LOG("Model add_bos: %s\n", add_bos_str);
    LOG("Input prompt: \"%s\"\n", prompt.c_str());
    LOG("Token ids (%zu):\n", tokens.size());

    std::string text;
    for (const auto tok : tokens) {
        // fresh 128-byte, zero-filled buffer for every token
        text.assign(128, '\0');
        const int len = llama_token_to_piece(vocab, tok, text.data(), text.size(), 0, true);
        if (len >= 0) {
            text.resize(len);
            LOG("%s(%d) ", text.c_str(), tok);
        } else {
            LOG_ERR("failed to convert token %d to piece\n", tok);
        }
    }
    LOG("\n");
}
|
||||
|
||||
// Tokenizes params.prompt, decodes it as a single batch, logs the tokenization and,
// when params.save_logits is set, writes the resulting logits/embeddings to disk.
// Returns false if the prompt yields no tokens or decoding fails, true otherwise.
static bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, llama_vocab_get_add_bos(vocab));

    if (tokens.empty()) {
        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
        return false;
    }

    const bool decode_failed = llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size())) != 0;
    if (decode_failed) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

    print_tokenized_prompt(ctx, tokens, params.prompt);

    if (!params.save_logits) {
        return true;
    }

    // Output files are named after the model file without directory and extension.
    output_data output {ctx, model, params};
    const std::string model_name = std::filesystem::path{params.model.path}.stem().string();
    save_output_data(output, model_name, params.logits_output_dir);

    return true;
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
callback_data cb_data(params, params.tensor_filter);
|
||||
|
||||
auto llama_init = common_init_from_params(params);
|
||||
|
||||
auto * model = llama_init->model();
|
||||
auto * ctx = llama_init->context();
|
||||
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
LOG_ERR("%s : failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
if (!run(ctx, params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -553,7 +553,6 @@ int main(int argc, char ** argv) {
|
||||
model_params.n_gpu_layers = params.n_gpu_layers;
|
||||
model_params.devices = params.devices.data();
|
||||
model_params.use_mmap = params.use_mmap;
|
||||
model_params.use_direct_io = params.use_direct_io;
|
||||
model_params.use_mlock = params.use_mlock;
|
||||
model_params.check_tensors = params.check_tensors;
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
|
||||
}
|
||||
|
||||
float * out = output + embd_pos * n_embd_out;
|
||||
common_embd_normalize(embd, out, n_embd_out, embd_norm);
|
||||
float * out = output + embd_pos * n_embd;
|
||||
common_embd_normalize(embd, out, n_embd, embd_norm);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// allocate output
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
std::vector<float> embeddings(n_embd_count * n_embd, 0);
|
||||
float * emb = embeddings.data();
|
||||
|
||||
// break into batches
|
||||
@@ -267,8 +267,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
|
||||
float * out = emb + e * n_embd_out;
|
||||
batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
|
||||
s = 0;
|
||||
common_batch_clear(batch);
|
||||
@@ -280,8 +280,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// final batch
|
||||
float * out = emb + e * n_embd_out;
|
||||
batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
|
||||
if (params.embd_out.empty()) {
|
||||
LOG("\n");
|
||||
@@ -289,19 +289,19 @@ int main(int argc, char ** argv) {
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
for (int j = 0; j < n_embd_count; j++) {
|
||||
LOG("embedding %d: ", j);
|
||||
for (int i = 0; i < std::min(3, n_embd_out); i++) {
|
||||
for (int i = 0; i < std::min(3, n_embd); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
LOG("%6.0f ", emb[j * n_embd_out + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
LOG("%9.6f ", emb[j * n_embd_out + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
LOG(" ... ");
|
||||
for (int i = n_embd_out - 3; i < n_embd_out; i++) {
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
LOG("%6.0f ", emb[j * n_embd_out + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
LOG("%9.6f ", emb[j * n_embd_out + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
LOG("\n");
|
||||
@@ -320,9 +320,9 @@ int main(int argc, char ** argv) {
|
||||
for (uint32_t i = 0; i < n_cls_out; i++) {
|
||||
// NOTE: if you change this log - update the tests in ci/run.sh
|
||||
if (n_cls_out == 1) {
|
||||
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
|
||||
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
|
||||
} else {
|
||||
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
|
||||
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -330,11 +330,11 @@ int main(int argc, char ** argv) {
|
||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
LOG("embedding %d: ", j);
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
LOG("%6.0f ", emb[j * n_embd_out + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
LOG("%9.6f ", emb[j * n_embd_out + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
LOG("\n");
|
||||
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
|
||||
LOG("\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
LOG("%6.2f ", sim);
|
||||
}
|
||||
LOG("%1.10s", prompts[i].c_str());
|
||||
@@ -368,9 +368,9 @@ int main(int argc, char ** argv) {
|
||||
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||
LOG("[");
|
||||
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
||||
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
|
||||
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||
i++;
|
||||
if (i < n_embd_out) LOG(","); else break;
|
||||
if (i < n_embd) LOG(","); else break;
|
||||
}
|
||||
LOG(notArray ? "]\n }" : "]");
|
||||
j++;
|
||||
@@ -383,7 +383,7 @@ int main(int argc, char ** argv) {
|
||||
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||
LOG(" [");
|
||||
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
LOG("%6.2f", sim);
|
||||
j++;
|
||||
if (j < n_embd_count) LOG(", "); else break;
|
||||
@@ -397,7 +397,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (notArray) LOG("\n}\n");
|
||||
} else if (params.embd_out == "raw") {
|
||||
print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
|
||||
print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-debug)
|
||||
add_executable(${TARGET} debug.cpp)
|
||||
set(TARGET llama-logits)
|
||||
add_executable(${TARGET} logits.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -61,7 +61,7 @@ causal-run-converted-model:
|
||||
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
|
||||
|
||||
causal-verify-logits: causal-run-original-model causal-run-converted-model
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
|
||||
@./scripts/causal/compare-logits.py
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
|
||||
|
||||
causal-run-original-embeddings:
|
||||
@@ -138,13 +138,16 @@ embedding-run-original-model-st: embedding-run-original-model
|
||||
embedding-run-converted-model:
|
||||
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
|
||||
$(if $(EMBD_NORMALIZE),--embd-normalize "$(EMBD_NORMALIZE)")
|
||||
$(if $(USE_POOLING),--pooling)
|
||||
|
||||
embedding-run-converted-model-st: USE_POOLING=1
|
||||
embedding-run-converted-model-st: embedding-run-converted-model
|
||||
|
||||
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
|
||||
@./scripts/embedding/compare-embeddings-logits.sh \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model
|
||||
embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
|
||||
@./scripts/embedding/compare-embeddings-logits.sh \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
|
||||
@@ -198,13 +198,14 @@ model, and the other is a text file which allows for manual visual inspection.
|
||||
|
||||
#### Using SentenceTransformer with numbered layers
|
||||
For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
|
||||
03_Dense, 04_Normalize), these will be applied automatically when running the
|
||||
converted model but currently there is a separate target to run the original
|
||||
version:
|
||||
03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
|
||||
|
||||
```console
|
||||
# Run original model with SentenceTransformer (applies all numbered layers)
|
||||
(venv) $ make embedding-run-original-model-st
|
||||
|
||||
# Run converted model with pooling enabled
|
||||
(venv) $ make embedding-run-converted-model-st
|
||||
```
|
||||
|
||||
This will use the SentenceTransformer library to load and run the model, which
|
||||
@@ -212,17 +213,6 @@ automatically applies all the numbered layers in the correct order. This is
|
||||
particularly useful when comparing with models that should include these
|
||||
additional transformation layers beyond just the base model output.
|
||||
|
||||
The type of normalization can be specified for the converted model but is not
|
||||
strictly necessary as the verification uses cosine similarity and the magnitude
|
||||
of the output vectors does not affect this. But the normalization type can be
|
||||
specified as an argument to the target which might be useful for manual
|
||||
inspection:
|
||||
```console
|
||||
(venv) $ make embedding-verify-logits-st EMBD_NORMALIZE=1
|
||||
```
|
||||
The original model will apply the normalization according to the normalization
|
||||
layer specified in the modules.json configuration file.
|
||||
|
||||
### Model conversion
|
||||
After updates have been made to [gguf-py](../../gguf-py) to add support for the
|
||||
new model the model can be converted to GGUF format using the following command:
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <ctype.h>
|
||||
#include <filesystem>
|
||||
|
||||
// Prints a short usage summary to stdout.
// The first parameter (the argument count) is unused; argv[0] is printed as
// the program name in the example command line.
static void print_usage(int, char ** argv) {
    const char * prog = argv[0];

    fputs("\nexample usage:\n", stdout);
    printf("\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [-pooling] [-embd-norm <norm>] [prompt]\n", prog);
    fputs("\n"
          " -embd-norm: normalization type for pooled embeddings (default: 2)\n"
          " -1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm\n"
          "\n",
          stdout);
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::string model_path;
|
||||
std::string prompt = "Hello, my name is";
|
||||
int ngl = 0;
|
||||
bool embedding_mode = false;
|
||||
bool pooling_enabled = false;
|
||||
int32_t embd_norm = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
|
||||
|
||||
{
|
||||
int i = 1;
|
||||
for (; i < argc; i++) {
|
||||
if (strcmp(argv[i], "-m") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
model_path = argv[++i];
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-ngl") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
try {
|
||||
ngl = std::stoi(argv[++i]);
|
||||
} catch (...) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-embd-mode") == 0) {
|
||||
embedding_mode = true;
|
||||
} else if (strcmp(argv[i], "-pooling") == 0) {
|
||||
pooling_enabled = true;
|
||||
} else if (strcmp(argv[i], "-embd-norm") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
try {
|
||||
embd_norm = std::stoi(argv[++i]);
|
||||
} catch (...) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
// prompt starts here
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (model_path.empty()) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (i < argc) {
|
||||
prompt = argv[i++];
|
||||
for (; i < argc; i++) {
|
||||
prompt += " ";
|
||||
prompt += argv[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_load_all();
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Extract basename from model_path
|
||||
const char * basename = strrchr(model_path.c_str(), '/');
|
||||
basename = (basename == NULL) ? model_path.c_str() : basename + 1;
|
||||
|
||||
char model_name[256];
|
||||
strncpy(model_name, basename, 255);
|
||||
model_name[255] = '\0';
|
||||
|
||||
char * dot = strrchr(model_name, '.');
|
||||
if (dot != NULL && strcmp(dot, ".gguf") == 0) {
|
||||
*dot = '\0';
|
||||
}
|
||||
printf("Model name: %s\n", model_name);
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
|
||||
std::vector<llama_token> prompt_tokens(n_prompt);
|
||||
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_prompt;
|
||||
ctx_params.n_batch = n_prompt;
|
||||
ctx_params.no_perf = false;
|
||||
if (embedding_mode) {
|
||||
ctx_params.embeddings = true;
|
||||
ctx_params.pooling_type = pooling_enabled ? LLAMA_POOLING_TYPE_MEAN : LLAMA_POOLING_TYPE_NONE;
|
||||
ctx_params.n_ubatch = ctx_params.n_batch;
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, ctx_params);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("Input prompt: \"%s\"\n", prompt.c_str());
|
||||
printf("Tokenized prompt (%d tokens): ", n_prompt);
|
||||
for (auto id : prompt_tokens) {
|
||||
char buf[128];
|
||||
int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
std::string s(buf, n);
|
||||
printf("%s (%d)", s.c_str(), id);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
float * data_ptr;
|
||||
int data_size;
|
||||
const char * type;
|
||||
std::vector<float> embd_out;
|
||||
|
||||
if (embedding_mode) {
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
|
||||
const int n_embeddings = n_embd * n_embd_count;
|
||||
float * embeddings;
|
||||
type = "-embeddings";
|
||||
|
||||
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
|
||||
embeddings = llama_get_embeddings_seq(ctx, 0);
|
||||
embd_out.resize(n_embeddings);
|
||||
printf("Normalizing embeddings using norm: %d\n", embd_norm);
|
||||
common_embd_normalize(embeddings, embd_out.data(), n_embeddings, embd_norm);
|
||||
embeddings = embd_out.data();
|
||||
} else {
|
||||
embeddings = llama_get_embeddings(ctx);
|
||||
}
|
||||
|
||||
printf("Embedding dimension: %d\n", n_embd);
|
||||
printf("\n");
|
||||
|
||||
// Print embeddings in the specified format
|
||||
for (int j = 0; j < n_embd_count; j++) {
|
||||
printf("embedding %d: ", j);
|
||||
|
||||
// Print first 3 values
|
||||
for (int i = 0; i < 3 && i < n_embd; i++) {
|
||||
printf("%9.6f ", embeddings[j * n_embd + i]);
|
||||
}
|
||||
|
||||
printf(" ... ");
|
||||
|
||||
// Print last 3 values
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
if (i >= 0) {
|
||||
printf("%9.6f ", embeddings[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Embeddings size: %d\n", n_embeddings);
|
||||
|
||||
data_ptr = embeddings;
|
||||
data_size = n_embeddings;
|
||||
} else {
|
||||
float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
const int n_logits = llama_vocab_n_tokens(vocab);
|
||||
type = "";
|
||||
printf("Vocab size: %d\n", n_logits);
|
||||
|
||||
data_ptr = logits;
|
||||
data_size = n_logits;
|
||||
}
|
||||
|
||||
std::filesystem::create_directory("data");
|
||||
|
||||
// Save data to binary file
|
||||
char bin_filename[512];
|
||||
snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
|
||||
printf("Saving data to %s\n", bin_filename);
|
||||
|
||||
FILE * f = fopen(bin_filename, "wb");
|
||||
if (f == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
fwrite(data_ptr, sizeof(float), data_size, f);
|
||||
fclose(f);
|
||||
|
||||
// Also save as text for debugging
|
||||
char txt_filename[512];
|
||||
snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type);
|
||||
f = fopen(txt_filename, "w");
|
||||
if (f == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
for (int i = 0; i < data_size; i++) {
|
||||
fprintf(f, "%d: %.6f\n", i, data_ptr[i]);
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
if (!embedding_mode) {
|
||||
printf("First 10 logits: ");
|
||||
for (int i = 0; i < 10 && i < data_size; i++) {
|
||||
printf("%.6f ", data_ptr[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Last 10 logits: ");
|
||||
for (int i = data_size - 10; i < data_size; i++) {
|
||||
if (i >= 0) printf("%.6f ", data_ptr[i]);
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
printf("Data saved to %s\n", bin_filename);
|
||||
printf("Data saved to %s\n", txt_filename);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -3,11 +3,10 @@
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
# Add utils directory to path for direct script execution
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
|
||||
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
|
||||
from common import get_model_name_from_env_path # type: ignore[import-not-found]
|
||||
|
||||
def quick_logits_check(pytorch_file, llamacpp_file):
|
||||
"""Lightweight sanity check before NMSE"""
|
||||
@@ -39,7 +38,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
|
||||
return True
|
||||
|
||||
def main():
|
||||
model_path = os.environ.get('MODEL_PATH')
|
||||
model_name = get_model_name_from_env_path('MODEL_PATH')
|
||||
data_dir = Path("data")
|
||||
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
|
||||
@@ -60,12 +58,6 @@ def main():
|
||||
|
||||
print("Checked all required files were found. Proceeding...\n")
|
||||
|
||||
# Verify tokens as they are a prerequisite for logits comparison.
|
||||
print("🔍 Token Comparison Check")
|
||||
print("=" * 40)
|
||||
if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
|
||||
exit_with_warning("\n❌ Token mismatch detected", model_path)
|
||||
print()
|
||||
|
||||
print("🔍 GGML Model Validation for model ", model_name)
|
||||
print("=" * 40)
|
||||
@@ -81,7 +73,8 @@ def main():
|
||||
print(" Ok to proceed with NMSE check...")
|
||||
sys.exit(0)
|
||||
else:
|
||||
exit_with_warning(f"❌ NOK: Top 10 predictions don't match - generation will differ", model_path)
|
||||
print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -67,7 +67,7 @@ with torch.no_grad():
|
||||
last_hidden_states = outputs.hidden_states[-1]
|
||||
|
||||
# Get embeddings for all tokens
|
||||
token_embeddings = last_hidden_states[0].float().cpu().numpy() # Remove batch dimension
|
||||
token_embeddings = last_hidden_states[0].cpu().numpy() # Remove batch dimension
|
||||
|
||||
print(f"Hidden states shape: {last_hidden_states.shape}")
|
||||
print(f"Token embeddings shape: {token_embeddings.shape}")
|
||||
|
||||
@@ -13,6 +13,6 @@ if [ -z "$CONVERTED_MODEL" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
cmake --build ../../build --target llama-logits -j8
|
||||
|
||||
../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
|
||||
../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"
|
||||
|
||||
@@ -21,6 +21,6 @@ fi
|
||||
echo $CONVERTED_MODEL
|
||||
echo $MODEL_TESTING_PROMPT
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
cmake --build ../../build --target llama-logits -j8
|
||||
|
||||
../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"
|
||||
|
||||
@@ -7,11 +7,12 @@ import importlib
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
from utils.common import debug_hook, save_output_data
|
||||
from utils.common import debug_hook
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description="Process model with specified path")
|
||||
@@ -125,7 +126,6 @@ def main():
|
||||
device = next(model.parameters()).device
|
||||
prompt = get_prompt(args)
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
|
||||
token_ids = input_ids[0].cpu().tolist()
|
||||
|
||||
print(f"Input tokens: {input_ids}")
|
||||
print(f"Input text: {repr(prompt)}")
|
||||
@@ -151,6 +151,19 @@ def main():
|
||||
print(f"Last token logits shape: {last_logits.shape}")
|
||||
print(f"Vocab size: {len(last_logits)}")
|
||||
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}.txt"
|
||||
|
||||
# Save to file for comparison
|
||||
last_logits.astype(np.float32).tofile(bin_filename)
|
||||
|
||||
# Also save as text file for easy inspection
|
||||
with open(txt_filename, "w") as f:
|
||||
for i, logit in enumerate(last_logits):
|
||||
f.write(f"{i}: {logit:.6f}\n")
|
||||
|
||||
# Print some sample logits for quick verification
|
||||
print(f"First 10 logits: {last_logits[:10]}")
|
||||
print(f"Last 10 logits: {last_logits[-10:]}")
|
||||
@@ -162,7 +175,8 @@ def main():
|
||||
token = tokenizer.decode([idx])
|
||||
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
|
||||
|
||||
save_output_data(last_logits, token_ids, prompt, model_name)
|
||||
print(f"Saved bin logits to: {bin_filename}")
|
||||
print(f"Saved txt logist to: {txt_filename}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -5,7 +5,7 @@ set -e
|
||||
# Parse command line arguments
|
||||
CONVERTED_MODEL=""
|
||||
PROMPTS_FILE=""
|
||||
EMBD_NORMALIZE="2"
|
||||
USE_POOLING=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
@@ -13,9 +13,9 @@ while [[ $# -gt 0 ]]; do
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--embd-normalize)
|
||||
EMBD_NORMALIZE="$2"
|
||||
shift 2
|
||||
--pooling)
|
||||
USE_POOLING="1"
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
@@ -50,5 +50,10 @@ fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
|
||||
cmake --build ../../build --target llama-logits -j8
|
||||
# TODO: update logits.cpp to accept a --file/-f option for the prompt
|
||||
if [ -n "$USE_POOLING" ]; then
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
|
||||
else
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
|
||||
fi
|
||||
|
||||
@@ -3,15 +3,13 @@
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModel
|
||||
import torch
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
from utils.common import save_output_data
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description='Run original embedding model')
|
||||
@@ -171,7 +169,6 @@ def main():
|
||||
return_tensors="pt"
|
||||
)
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_ids = tokens.cpu().tolist()
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
@@ -188,7 +185,6 @@ def main():
|
||||
)
|
||||
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_ids = tokens.cpu().tolist()
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
@@ -232,11 +228,24 @@ def main():
|
||||
|
||||
print()
|
||||
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
|
||||
|
||||
flattened_embeddings = all_embeddings.flatten()
|
||||
flattened_embeddings.astype(np.float32).tofile(bin_filename)
|
||||
|
||||
with open(txt_filename, "w") as f:
|
||||
idx = 0
|
||||
for j in range(n_embd_count):
|
||||
for value in all_embeddings[j]:
|
||||
f.write(f"{idx}: {value:.6f}\n")
|
||||
idx += 1
|
||||
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
|
||||
print("")
|
||||
|
||||
save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings")
|
||||
print(f"Saved bin embeddings to: {bin_filename}")
|
||||
print(f"Saved txt embeddings to: {txt_filename}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -3,11 +3,6 @@
|
||||
import os
|
||||
import sys
|
||||
import torch
|
||||
import transformers
|
||||
import json
|
||||
import textwrap
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_model_name_from_env_path(env_path_name):
|
||||
@@ -153,147 +148,3 @@ def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_
|
||||
# Patch it
|
||||
setattr(module, function_name, debug_rope)
|
||||
print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
|
||||
|
||||
|
||||
def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
    """
    Save output data (logits/embeddings), tokens, and prompt to files.

    Args:
        data: torch.Tensor or array-like of floats (logits or embeddings)
        tokens: torch.Tensor, list or array of token IDs
        prompt: string containing the input prompt
        model_name: name of the model
        type_suffix: optional suffix like "-embeddings" (default: "")
        output_dir: directory to save files (default: "data"); created,
            including missing parent directories, if it does not exist

    Creates the following files in output_dir:
        - pytorch-{model_name}{type_suffix}.bin         (float32 values)
        - pytorch-{model_name}{type_suffix}.txt         ("index: value" lines)
        - pytorch-{model_name}{type_suffix}-prompt.txt
        - pytorch-{model_name}{type_suffix}-tokens.bin  (int32 token ids)
    """
    def _as_flat_array(values):
        # Convert a tensor or array-like to a 1-D NumPy array.
        if isinstance(values, torch.Tensor):
            # detach(): a tensor that requires grad cannot be converted to NumPy.
            values = values.detach()
            # NumPy has no bfloat16 dtype, so Tensor.numpy() raises for it.
            if values.dtype == torch.bfloat16:
                values = values.float()
            values = values.cpu().numpy()
        else:
            values = np.asarray(values)
        return values.flatten() if values.ndim > 1 else values

    data_dir = Path(output_dir)
    # parents=True so a nested output_dir does not fail on a missing parent.
    data_dir.mkdir(parents=True, exist_ok=True)
    base_path = data_dir / f"pytorch-{model_name}{type_suffix}"

    # Convert and flatten logits/embeddings
    data = _as_flat_array(data)

    # Save logits/embedding files
    data.astype(np.float32).tofile(f"{base_path}.bin")
    print(f"Data saved to {base_path}.bin")

    with open(f"{base_path}.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data))
    print(f"Data saved to {base_path}.txt")

    # Convert and flatten tokens
    tokens = _as_flat_array(tokens)

    # Save token binary file
    tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin")
    print(f"Tokens saved to {base_path}-tokens.bin")

    # Save prompt file; explicit UTF-8 so non-ASCII prompts do not depend on
    # the platform's default encoding.
    with open(f"{base_path}-prompt.txt", "w", encoding="utf-8") as f:
        f.write(f"prompt: {prompt}\n")
        f.write(f"n_tokens: {len(tokens)}\n")
        f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n")
    print(f"Prompt saved to {base_path}-prompt.txt")
|
||||
|
||||
|
||||
def compare_tokens(original, converted, type_suffix="", output_dir="data"):
    """
    Compare the int32 token-id files written for two model runs.

    Reads {output_dir}/{original}{type_suffix}-tokens.bin and
    {output_dir}/{converted}{type_suffix}-tokens.bin, prints a report and
    returns True only when both files exist and hold identical sequences.
    At most ten mismatching positions are listed.
    """
    root = Path(output_dir)
    token_paths = [root / f"{name}{type_suffix}-tokens.bin" for name in (original, converted)]

    # Report the first missing file only (original is checked first).
    for path in token_paths:
        if not path.exists():
            print(f"Error: Token file not found: {path}")
            return False

    ref_ids, cand_ids = (np.fromfile(path, dtype=np.int32) for path in token_paths)
    n_ref, n_cand = len(ref_ids), len(cand_ids)

    print(f"\nComparing tokens between:")
    print(f" Original : {original} ({n_ref} tokens)")
    print(f" Converted: {converted} ({n_cand} tokens)")

    if n_ref != n_cand:
        print(f"\n❌ Token count mismatch: {n_ref} vs {n_cand}")
        return False

    # Same length from here on, so an element-wise comparison is well defined.
    diff_positions = np.flatnonzero(ref_ids != cand_ids)
    n_diff = len(diff_positions)
    if n_diff == 0:
        print(f"\n✅ All {n_ref} tokens match!")
        return True

    print(f"\n❌ Found {n_diff} mismatched tokens:")

    shown = min(n_diff, 10)
    for pos in diff_positions[:shown]:
        print(f" Position {pos}: {ref_ids[pos]} vs {cand_ids[pos]}")

    if n_diff > shown:
        print(f" ... and {n_diff - shown} more mismatches")

    return False
|
||||
|
||||
|
||||
def show_version_warning(current_version, model_version):
    """
    Decide whether a transformers version-mismatch warning should be shown.

    Returns False when model_version is empty. Otherwise returns True when
    current_version is older than model_version according to
    packaging.version; if `packaging` is not installed, or either string is
    not a valid version, falls back to a plain inequality check.
    """
    if not model_version:
        return False

    def _plain_mismatch():
        # Fallback when the versions cannot be ordered.
        return current_version != model_version

    try:
        from packaging import version as pkg_version
    except ImportError:
        return _plain_mismatch()

    try:
        return pkg_version.parse(current_version) < pkg_version.parse(model_version)
    except pkg_version.InvalidVersion:
        return _plain_mismatch()
|
||||
|
||||
def get_model_transformers_version(model_path):
    """
    Read the "transformers_version" field from a model's config.json.

    Args:
        model_path: directory that is expected to contain config.json

    Returns:
        The value stored under "transformers_version", or None when
        model_path is empty, config.json is missing, the file cannot be read
        or parsed, its top-level JSON value is not an object, or the key is
        absent. Read/parse problems are reported on stderr and never raised,
        because this helper runs while a failure is already being reported.
    """
    if not model_path:
        return None

    config_path = Path(model_path) / "config.json"
    if not config_path.is_file():
        return None

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        # (a config.json that is not valid UTF-8).
        print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
        return None

    if not isinstance(config, dict):
        # Valid JSON, but e.g. a list or a string: there is no key to look up.
        print(f"Warning: Could not read or parse {config_path}: top-level JSON value is not an object",
              file=sys.stderr)
        return None

    return config.get("transformers_version")
|
||||
|
||||
def exit_with_warning(message, model_path):
    """
    Print `message`, optionally followed by a transformers version-mismatch
    hint, then terminate the process with exit status 1.

    The hint is printed only when model_path is given and
    show_version_warning() reports a mismatch between the installed
    transformers version and the one returned by
    get_model_transformers_version(model_path).
    """
    print(message)

    can_check_versions = bool(model_path) and transformers is not None
    if can_check_versions:
        required_version = get_model_transformers_version(model_path)
        installed_version = transformers.__version__
        if show_version_warning(installed_version, required_version):
            # textwrap.dedent() strips the common indentation of the banner.
            banner = f"""
                =====================================================================
                Verification failure might be due to a transformers version mismatch:

                Current transformers version: {installed_version}
                Model's required version : {required_version}

                Consider installing the version specified by the model's config:
                pip install transformers=={required_version}
                =====================================================================
            """
            print(textwrap.dedent(banner))

    sys.exit(1)
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from common import compare_tokens # type: ignore
|
||||
|
||||
|
||||
def parse_arguments():
    """
    Parse the command line of the token comparison script.

    Returns an argparse.Namespace with the positional `original` and
    `converted` model names plus the options `suffix` (default ""),
    `data_dir` (default "data") and `verbose` (default False).
    """
    cli = argparse.ArgumentParser(
        description='Compare tokens between two models',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
"""
    )

    # Positional model names, in the order they must be given.
    for dest, text in (('original', 'Original model name'),
                       ('converted', 'Converted model name')):
        cli.add_argument(dest, help=text)

    # Optional flags: (option strings, keyword arguments for add_argument).
    optional_specs = (
        (('-s', '--suffix'),
         {'default': '', 'help': 'Type suffix (e.g., "-embeddings")'}),
        (('-d', '--data-dir'),
         {'default': 'data', 'help': 'Directory containing token files (default: data)'}),
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Print prompts from both models'}),
    )
    for flags, kwargs in optional_specs:
        cli.add_argument(*flags, **kwargs)

    return cli.parse_args()
|
||||
|
||||
|
||||
def main():
    """
    Compare the token files of two models and exit with status 0 when they
    match and 1 otherwise. With --verbose, the saved prompt file of each
    model is printed first when it exists.
    """
    args = parse_arguments()

    if args.verbose:
        from pathlib import Path
        prompt_dir = Path(args.data_dir)

        for label, model in (("Original", args.original), ("Converted", args.converted)):
            prompt_file = prompt_dir / f"{model}{args.suffix}-prompt.txt"
            if prompt_file.exists():
                print(f"\n{label} model prompt ({model}):")
                print(f" {prompt_file.read_text().strip()}")

        print()

    tokens_match = compare_tokens(
        args.original,
        args.converted,
        type_suffix=args.suffix,
        output_dir=args.data_dir
    )

    # The exit status lets shell scripts check for success/failure.
    sys.exit(0 if tokens_match else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -4,10 +4,8 @@ import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
|
||||
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
@@ -159,24 +157,9 @@ def main():
|
||||
else:
|
||||
prompt = args.prompt
|
||||
|
||||
python_emb_path = Path(args.python_embeddings)
|
||||
cpp_emb_path = Path(args.cpp_embeddings)
|
||||
|
||||
# Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
|
||||
python_model_name = python_emb_path.stem.replace("-embeddings", "")
|
||||
cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
|
||||
|
||||
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
|
||||
print("=" * 70)
|
||||
|
||||
# First verify tokens match before comparing embeddings
|
||||
print("\n🔍 Token Comparison Check")
|
||||
print("=" * 70)
|
||||
data_dir = python_emb_path.parent
|
||||
if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
|
||||
exit_with_warning("\n❌ Token mismatch detected", args.model_path)
|
||||
print()
|
||||
|
||||
# Single prompt detailed comparison
|
||||
print(f"\nTesting with prompt: '{prompt}'")
|
||||
|
||||
@@ -236,7 +219,7 @@ def main():
|
||||
elif avg_cross_sim > 0.70:
|
||||
print("⚠️ FAIR: Models have some differences")
|
||||
else:
|
||||
exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
|
||||
print("❌ POOR: Models are significantly different")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
|
||||
// allocate output
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
std::vector<float> embeddings(n_chunks * n_embd_out, 0);
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
std::vector<float> embeddings(n_chunks * n_embd, 0);
|
||||
float * emb = embeddings.data();
|
||||
|
||||
// break into batches
|
||||
@@ -232,8 +232,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
|
||||
float * out = emb + p * n_embd_out;
|
||||
batch_process(ctx, batch, out, s, n_embd_out);
|
||||
float * out = emb + p * n_embd;
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
common_batch_clear(batch);
|
||||
p += s;
|
||||
s = 0;
|
||||
@@ -245,12 +245,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// final batch
|
||||
float * out = emb + p * n_embd_out;
|
||||
batch_process(ctx, batch, out, s, n_embd_out);
|
||||
float * out = emb + p * n_embd;
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
|
||||
// save embeddings to chunks
|
||||
for (int i = 0; i < n_chunks; i++) {
|
||||
chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
|
||||
chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
|
||||
// clear tokens as they are no longer needed
|
||||
chunks[i].tokens.clear();
|
||||
}
|
||||
@@ -266,8 +266,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
batch_add_seq(query_batch, query_tokens, 0);
|
||||
|
||||
std::vector<float> query_emb(n_embd_out, 0);
|
||||
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);
|
||||
std::vector<float> query_emb(n_embd, 0);
|
||||
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
|
||||
common_batch_clear(query_batch);
|
||||
|
||||
@@ -275,7 +275,7 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
std::vector<std::pair<int, float>> similarities;
|
||||
for (int i = 0; i < n_chunks; i++) {
|
||||
float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
|
||||
float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
|
||||
similarities.push_back(std::make_pair(i, sim));
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 3
|
||||
#define RPC_PROTO_MINOR_VERSION 6
|
||||
#define RPC_PROTO_MINOR_VERSION 7
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
|
||||
@@ -1963,7 +1963,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor *
|
||||
acl_tensor_ptr acl_weight_tensor;
|
||||
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
if (weight_to_nz && is_matmul_weight(weight)) {
|
||||
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
|
||||
} else {
|
||||
|
||||
@@ -103,7 +103,7 @@ const ggml_cann_device_info & ggml_cann_info();
|
||||
void ggml_cann_set_device(int32_t device);
|
||||
int32_t ggml_cann_get_device();
|
||||
|
||||
std::optional<std::string> get_env_as_lowercase(const std::string & name);
|
||||
std::optional<std::string> get_env(const std::string & name);
|
||||
bool parse_bool(const std::string & value);
|
||||
int parse_integer(const std::string & value);
|
||||
|
||||
|
||||
@@ -105,10 +105,10 @@ int32_t ggml_cann_get_device() {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the value of the specified environment variable (name) as lowercase.
|
||||
* @brief Get the value of the specified environment variable (name).
|
||||
* if not empty, return a std::string object
|
||||
*/
|
||||
std::optional<std::string> get_env_as_lowercase(const std::string & name) {
|
||||
std::optional<std::string> get_env(const std::string & name) {
|
||||
const char * val = std::getenv(name.c_str());
|
||||
if (!val) {
|
||||
return std::nullopt;
|
||||
@@ -122,7 +122,7 @@ std::optional<std::string> get_env_as_lowercase(const std::string & name) {
|
||||
* @brief Verify whether the environment variable is a valid value.
|
||||
*/
|
||||
bool parse_bool(const std::string & value) {
|
||||
static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
|
||||
std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
|
||||
return valid_values.find(value) != valid_values.end();
|
||||
}
|
||||
|
||||
@@ -259,7 +259,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
||||
* @param device The device ID to associate with this buffer pool.
|
||||
*/
|
||||
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
||||
disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
||||
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -452,7 +452,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
||||
* @param device The device ID to associate with this buffer pool.
|
||||
*/
|
||||
explicit ggml_cann_pool_buf(int device) : device(device) {
|
||||
disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
||||
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -764,7 +764,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
* @return A unique pointer to the created CANN pool.
|
||||
*/
|
||||
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
|
||||
std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
|
||||
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
||||
|
||||
if (mem_pool_type == "prio") {
|
||||
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
||||
@@ -1217,7 +1217,7 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
// Why aclrtSynchronizeDevice?
|
||||
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
||||
if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||
@@ -1442,7 +1442,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
|
||||
// last line must bigger than 32, because every single op deal at
|
||||
// least 32 bytes.
|
||||
@@ -2136,7 +2136,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
||||
#endif // USE_ACL_GRAPH
|
||||
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
||||
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
||||
static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
|
||||
static bool opt_fusion = parse_bool(get_env("GGML_CANN_OPERATOR_FUSION").value_or(""));
|
||||
|
||||
if (!use_cann_graph || cann_graph_capture_required) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
@@ -2201,7 +2201,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
|
||||
#ifdef USE_ACL_GRAPH
|
||||
bool use_cann_graph = true;
|
||||
|
||||
static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
|
||||
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
|
||||
if (!prefill_use_graph) {
|
||||
// Do not use acl_graph for prefill.
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
@@ -2541,6 +2541,27 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determines if a tensor operation should be offloaded to the CANN
|
||||
* backend.
|
||||
*
|
||||
* This function checks if a given tensor operation should be offloaded to the
|
||||
* CANN backend based on the operation type and the size of the tensor. It
|
||||
* returns true if the second dimension (ne[1]) of the tensor is greater than or
|
||||
* equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend.
|
||||
* @param op Pointer to the tensor operation to check.
|
||||
* @return bool Returns true if the operation should be offloaded, otherwise
|
||||
* false.
|
||||
*/
|
||||
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
GGML_UNUSED(dev);
|
||||
|
||||
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Records an event on the CANN backend stream.
|
||||
*
|
||||
@@ -2616,7 +2637,6 @@ struct ggml_backend_cann_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -2693,26 +2713,6 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
|
||||
return ggml_backend_cann_host_buffer_type();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determines if a tensor operation should be offloaded to the CANN
|
||||
* backend.
|
||||
*
|
||||
* This function checks if a given tensor operation should be offloaded to the
|
||||
* CANN backend based on the operation type and the size of the tensor. It
|
||||
* returns true if the second dimension (ne[1]) of the tensor is greater than or
|
||||
* equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend.
|
||||
* @param op Pointer to the tensor operation to check.
|
||||
* @return bool Returns true if the operation should be offloaded, otherwise
|
||||
* false.
|
||||
*/
|
||||
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
|
||||
|
||||
return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Creates a new event for the CANN backend device.
|
||||
*
|
||||
@@ -2829,14 +2829,12 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
|
||||
if (!initialized) {
|
||||
aclInit(nullptr);
|
||||
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
for (int i = 0; i < ggml_cann_info().device_count; i++) {
|
||||
ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
|
||||
dev_ctx->description = aclrtGetSocName();
|
||||
dev_ctx->device = i;
|
||||
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
|
||||
dev_ctx->op_offload_min_batch_size = min_batch_size;
|
||||
ggml_cann_set_device(i);
|
||||
ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
|
||||
/* .reg = */ ®,
|
||||
|
||||
@@ -47,10 +47,7 @@ if (CUDAToolkit_FOUND)
|
||||
# check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
|
||||
# However, the architectures 120a-real and 121a-real should work with basically any CMake version and
|
||||
# until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
|
||||
endif()
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real 121a-real)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -1036,7 +1036,7 @@ struct ggml_tensor_extra_gpu {
|
||||
#define USE_CUDA_GRAPH
|
||||
#endif
|
||||
|
||||
struct ggml_cuda_graph_node_properties {
|
||||
struct ggml_graph_node_properties {
|
||||
void * node_address;
|
||||
ggml_op node_op;
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
@@ -1061,25 +1061,10 @@ struct ggml_cuda_graph {
|
||||
std::vector<cudaGraphNode_t> nodes;
|
||||
bool disable_due_to_gpu_arch = false;
|
||||
bool disable_due_to_too_many_updates = false;
|
||||
bool disable_due_to_failed_graph_capture = false;
|
||||
int number_consecutive_updates = 0;
|
||||
std::vector<ggml_cuda_graph_node_properties> props;
|
||||
|
||||
void record_update(bool use_graph, bool update_required) {
|
||||
if (use_graph && update_required) {
|
||||
number_consecutive_updates++;
|
||||
} else {
|
||||
number_consecutive_updates = 0;
|
||||
}
|
||||
if (number_consecutive_updates >= 4) {
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
||||
disable_due_to_too_many_updates = true;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_enabled() const {
|
||||
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
|
||||
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
|
||||
}
|
||||
bool cuda_graphs_enabled = false;
|
||||
std::vector<ggml_graph_node_properties> ggml_graph_properties;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@@ -11,12 +11,10 @@
|
||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
||||
|
||||
// log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
|
||||
// by the VKQ accumulators is effectively being shifted up by a factor of 2.
|
||||
// by the VKQ accumulators is effectively being shifted up by a factor of 8.
|
||||
// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
|
||||
// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
|
||||
// Still, the value range should be shifted as much as necessary but as little as possible.
|
||||
// The macro on the following line shifts it by a factor of 2**3=8, as was needed to fix https://github.com/ggml-org/llama.cpp/issues/18606 .
|
||||
#define FATTN_KQ_MAX_OFFSET (3.0f*0.6931f)
|
||||
#define FATTN_KQ_MAX_OFFSET 0.6931f
|
||||
|
||||
typedef void (* fattn_kernel_t)(
|
||||
const char * __restrict__ Q,
|
||||
|
||||
@@ -2853,9 +2853,9 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
bool use_cuda_graph) {
|
||||
|
||||
bool use_cuda_graph = true;
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
|
||||
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
|
||||
@@ -2915,41 +2915,41 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
return use_cuda_graph;
|
||||
}
|
||||
|
||||
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
|
||||
props->node_address = node->data;
|
||||
props->node_op = node->op;
|
||||
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||
graph_node_properties->node_address = node->data;
|
||||
graph_node_properties->node_op = node->op;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
props->ne[i] = node->ne[i];
|
||||
props->nb[i] = node->nb[i];
|
||||
graph_node_properties->ne[i] = node->ne[i];
|
||||
graph_node_properties->nb[i] = node->nb[i];
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||
}
|
||||
memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
}
|
||||
|
||||
static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
|
||||
if (node->data != props->node_address &&
|
||||
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||
if (node->data != graph_node_properties->node_address &&
|
||||
node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node->op != props->node_op) {
|
||||
if (node->op != graph_node_properties->node_op) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->ne[i] != props->ne[i]) {
|
||||
if (node->ne[i] != graph_node_properties->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (node->nb[i] != props->nb[i]) {
|
||||
if (node->nb[i] != graph_node_properties->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i] &&
|
||||
node->src[i]->data != props->src_address[i] &&
|
||||
node->src[i]->data != graph_node_properties->src_address[i] &&
|
||||
node->op != GGML_OP_VIEW
|
||||
) {
|
||||
return false;
|
||||
@@ -2957,55 +2957,44 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
|
||||
}
|
||||
|
||||
if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
|
||||
memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
||||
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
|
||||
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
|
||||
|
||||
bool res = false;
|
||||
bool cuda_graph_update_required = false;
|
||||
|
||||
if (cuda_ctx->cuda_graph->instance == nullptr) {
|
||||
res = true;
|
||||
cuda_graph_update_required = true;
|
||||
}
|
||||
|
||||
// Check if the graph size has changed
|
||||
if (cuda_ctx->cuda_graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) {
|
||||
res = true;
|
||||
cuda_ctx->cuda_graph->props.resize(cgraph->n_nodes + cgraph->n_leafs);
|
||||
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
|
||||
cuda_graph_update_required = true;
|
||||
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||
}
|
||||
|
||||
// Loop over nodes in GGML graph to determine if CUDA graph update is required
|
||||
// and store properties to allow this comparison for the next token
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
bool props_match = true;
|
||||
if (!res) {
|
||||
props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &cuda_ctx->cuda_graph->props[i]);
|
||||
bool has_matching_properties = true;
|
||||
if (!cuda_graph_update_required) {
|
||||
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
|
||||
}
|
||||
if (!props_match) {
|
||||
res = true;
|
||||
if (!has_matching_properties) {
|
||||
cuda_graph_update_required = true;
|
||||
}
|
||||
ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[i], cgraph->nodes[i]);
|
||||
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
bool props_match= true;
|
||||
if (!res) {
|
||||
props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &cuda_ctx->cuda_graph->props[cgraph->n_nodes + i]);
|
||||
}
|
||||
if (!props_match) {
|
||||
res = true;
|
||||
}
|
||||
ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[cgraph->n_nodes + i], cgraph->leafs[i]);
|
||||
}
|
||||
|
||||
return res;
|
||||
return cuda_graph_update_required;
|
||||
}
|
||||
|
||||
static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
|
||||
#if CUDART_VERSION >= 12000
|
||||
cudaGraphExecUpdateResultInfo result_info;
|
||||
@@ -3236,11 +3225,10 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required) {
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
// flag used to determine whether it is an integrated_gpu
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
|
||||
ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
|
||||
bool is_concurrent_event_active = false;
|
||||
@@ -3710,7 +3698,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
|
||||
}
|
||||
if (cuda_graph_update_required) { // Update graph executable
|
||||
ggml_cuda_graph_update_executable(cuda_ctx);
|
||||
update_cuda_graph_executable(cuda_ctx);
|
||||
}
|
||||
// Launch graph
|
||||
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
|
||||
@@ -3720,25 +3708,43 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
|
||||
static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
|
||||
|
||||
// Objects required for CUDA Graph
|
||||
if (cuda_ctx->cuda_graph == nullptr) {
|
||||
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
|
||||
}
|
||||
|
||||
bool use_cuda_graph = true;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
|
||||
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return cuda_ctx->cuda_graph->is_enabled();
|
||||
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
|
||||
// or previous graph capture failure.
|
||||
// Also disable for multi-gpu for now. TO DO investigate
|
||||
if (disable_cuda_graphs_due_to_env
|
||||
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|
||||
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|
||||
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
|
||||
use_cuda_graph = false;
|
||||
}
|
||||
|
||||
cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
|
||||
#else
|
||||
return false;
|
||||
bool use_cuda_graph = false;
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
return use_cuda_graph;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
@@ -3749,14 +3755,30 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
bool use_cuda_graph = false;
|
||||
bool cuda_graph_update_required = false;
|
||||
|
||||
// graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
|
||||
// we call it here instead.
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
|
||||
use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
|
||||
|
||||
if (cuda_ctx->cuda_graph->is_enabled()) {
|
||||
cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
|
||||
if (use_cuda_graph) {
|
||||
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
|
||||
cuda_ctx->cuda_graph->record_update(use_cuda_graph, cuda_graph_update_required);
|
||||
use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||
} else {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||
}
|
||||
|
||||
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
|
||||
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
|
||||
cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
@@ -3770,7 +3792,9 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
|
||||
}
|
||||
|
||||
ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required);
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -3803,7 +3827,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
|
||||
static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
|
||||
|
||||
const bool use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
|
||||
const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
|
||||
|
||||
static bool enable_graph_optimization = [] {
|
||||
const char * env = getenv("GGML_CUDA_GRAPH_OPT");
|
||||
@@ -4122,7 +4146,6 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -4677,9 +4700,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
|
||||
return get_op_batch_size(op) >= min_batch_size;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
|
||||
@@ -4847,7 +4872,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||
@@ -4861,7 +4885,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
dev_ctx->op_offload_min_batch_size = min_batch_size;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
|
||||
@@ -34,11 +34,13 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
// CUDA_GRAPHS_DISABLED
|
||||
((ncols > 65536) &&
|
||||
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->is_enabled())) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
|
||||
// CUDA_GRAPHS ENABLED
|
||||
((ncols > 32768) &&
|
||||
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->is_enabled()))) {
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
||||
#else
|
||||
(ncols > 65536)) {
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
@@ -333,28 +333,6 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
}
|
||||
|
||||
if (amd_wmma_available(cc)) {
|
||||
// RDNA 4 is consistently worse on rocblas
|
||||
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
// High expert counts almost always better on MMQ
|
||||
// due to a large amount of graph splits
|
||||
// https://github.com/ggml-org/llama.cpp/pull/18202
|
||||
if (n_experts >= 64) {
|
||||
return true;
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
// These quants are really bad on MMQ
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
// These quants are usually worse but not always
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return ne11 <= 128;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -114,7 +114,7 @@ __global__ void __launch_bounds__(splitD, 1)
|
||||
#endif // __clang__
|
||||
|
||||
// assumes as many threads as d_state
|
||||
template <int c_factor, int d_state>
|
||||
template <int splitH, int d_state>
|
||||
__global__ void __launch_bounds__(d_state, 1)
|
||||
ssm_scan_f32_group(
|
||||
const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
|
||||
@@ -125,25 +125,20 @@ __global__ void __launch_bounds__(d_state, 1)
|
||||
const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
|
||||
const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok) {
|
||||
|
||||
const int warp = threadIdx.x / WARP_SIZE;
|
||||
const int lane = threadIdx.x % WARP_SIZE;
|
||||
const int warp_idx = blockIdx.x * c_factor + warp;
|
||||
|
||||
const int head_idx = warp_idx / d_head;
|
||||
const int head_off = (warp_idx % d_head) * sizeof(float);
|
||||
const int seq_idx = blockIdx.y;
|
||||
const int head_idx = (blockIdx.x * splitH) / d_head;
|
||||
const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float);
|
||||
const int seq_idx = blockIdx.y;
|
||||
|
||||
const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);
|
||||
|
||||
// TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase
|
||||
const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
const float * x_warp = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
|
||||
const float * dt_warp = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
|
||||
const float * A_warp = (const float *) ((const char *) src3 + head_idx * src3_nb1);
|
||||
const float * B_warp = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
|
||||
const float * C_warp = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
|
||||
float * y_warp = dst + (seq_idx * n_tok * n_head * d_head) + warp_idx;
|
||||
float * s_warp = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
const float * s0_block = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
const float * x_block = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + blockIdx.x * splitH * sizeof(float));
|
||||
const float * dt_block = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
|
||||
const float * A_block = (const float *) ((const char *) src3 + head_idx * src3_nb1);
|
||||
const float * B_block = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
|
||||
const float * C_block = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
|
||||
float * y_block = dst + (seq_idx * n_tok * n_head * d_head) + blockIdx.x * splitH;
|
||||
float * s_block = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
|
||||
// strides across n_seq_tokens
|
||||
const int stride_x = src1_nb2 / sizeof(float);
|
||||
@@ -152,42 +147,80 @@ __global__ void __launch_bounds__(d_state, 1)
|
||||
const int stride_C = src5_nb2 / sizeof(float);
|
||||
const int stride_y = n_head * d_head;
|
||||
|
||||
float state[c_factor];
|
||||
float state_sum = 0.0f;
|
||||
float state[splitH];
|
||||
// for the parallel accumulation
|
||||
__shared__ float stateC[splitH * d_state];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < c_factor; j++) {
|
||||
state[j] = s0_warp[WARP_SIZE * j + lane];
|
||||
for (int j = 0; j < splitH; j++) {
|
||||
state[j] = s0_block[j * d_state + threadIdx.x];
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < n_tok; i++) {
|
||||
// NOTE: dt_soft_plus, dA and x_dt have the same value for a warp here.
|
||||
// Recalculation is intentional; sharing via shuffles/smem proved slower due to sync overhead.
|
||||
const float dt_soft_plus = (dt_warp[i * stride_dt] <= 20.0f ? log1pf(expf(dt_warp[i * stride_dt])) : dt_warp[i * stride_dt]);
|
||||
// TODO: only calculate dA and dt_soft_plus once per head instead of every splitH head elements
|
||||
// TODO: only calculate B and C once per head group
|
||||
// NOTE: dt_soft_plus, dA and x_dt have the same value across threads here.
|
||||
float dt_soft_plus = dt_block[i * stride_dt];
|
||||
if (dt_soft_plus <= 20.0f) {
|
||||
dt_soft_plus = log1pf(expf(dt_soft_plus));
|
||||
}
|
||||
const float dA = expf(dt_soft_plus * A_block[0]);
|
||||
const float B = B_block[i * stride_B + threadIdx.x];
|
||||
const float C = C_block[i * stride_C + threadIdx.x];
|
||||
|
||||
state_sum = 0.0f;
|
||||
const float dA = expf(dt_soft_plus * A_warp[0]);
|
||||
const float x_dt = x_warp[i * stride_x] * dt_soft_plus;
|
||||
// across d_head
|
||||
#pragma unroll
|
||||
for (int j = 0; j < c_factor; j++) {
|
||||
const float B_val = B_warp[i * stride_B + WARP_SIZE * j + lane];
|
||||
const float C_val = C_warp[i * stride_C + WARP_SIZE * j + lane];
|
||||
state[j] = (state[j] * dA) + (B_val * x_dt);
|
||||
state_sum += state[j] * C_val;
|
||||
for (int j = 0; j < splitH; j++) {
|
||||
const float x_dt = x_block[i * stride_x + j] * dt_soft_plus;
|
||||
|
||||
state[j] = (state[j] * dA) + (B * x_dt);
|
||||
|
||||
stateC[j * d_state + threadIdx.x] = state[j] * C;
|
||||
}
|
||||
|
||||
// parallel accumulation for output
|
||||
state_sum = warp_reduce_sum(state_sum);
|
||||
__syncthreads();
|
||||
|
||||
if (lane == 0) {
|
||||
y_warp[i * stride_y] = state_sum;
|
||||
// parallel accumulation for stateC
|
||||
// TODO: simplify
|
||||
{
|
||||
static_assert((d_state & -d_state) == d_state, "the state size has to be a power of 2");
|
||||
static_assert((splitH & -splitH) == splitH, "splitH has to be a power of 2");
|
||||
|
||||
// reduce until w matches the warp size
|
||||
// TODO: does this work even when the physical warp size is 64?
|
||||
#pragma unroll
|
||||
for (int w = d_state; w > WARP_SIZE; w >>= 1) {
|
||||
// (assuming there are d_state threads)
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ((w >> 1) * splitH + d_state - 1) / d_state; j++) {
|
||||
// TODO: check for bank conflicts
|
||||
const int k = (threadIdx.x % (w >> 1)) + (d_state * (threadIdx.x / (w >> 1))) + j * d_state * (d_state / (w >> 1));
|
||||
stateC[k] += stateC[k + (w >> 1)];
|
||||
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
static_assert(splitH >= d_state / WARP_SIZE);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < splitH / (d_state / WARP_SIZE); j++) {
|
||||
float y = stateC[(threadIdx.x % WARP_SIZE) + d_state * (threadIdx.x / WARP_SIZE) + j * d_state * (d_state / WARP_SIZE)];
|
||||
y = warp_reduce_sum(y);
|
||||
|
||||
// store the above accumulations
|
||||
if (threadIdx.x % WARP_SIZE == 0) {
|
||||
const int k = threadIdx.x / WARP_SIZE + j * (d_state / WARP_SIZE);
|
||||
y_block[i * stride_y + k] = y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// write back the state
|
||||
#pragma unroll
|
||||
for (int j = 0; j < c_factor; j++) {
|
||||
s_warp[WARP_SIZE * j + lane] = state[j];
|
||||
for (int j = 0; j < splitH; j++) {
|
||||
s_block[j * d_state + threadIdx.x] = state[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,24 +231,27 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
|
||||
const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
|
||||
cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
// NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
|
||||
if (src3_nb1 == sizeof(float)) {
|
||||
// Mamba-2
|
||||
if (d_state == 128) {
|
||||
constexpr int threads = 128;
|
||||
constexpr int num_warps = threads/WARP_SIZE;
|
||||
|
||||
const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
|
||||
ssm_scan_f32_group<128/WARP_SIZE, 128><<<blocks, threads, 0, stream>>>(
|
||||
GGML_ASSERT(d_state % threads == 0);
|
||||
// NOTE: can be any power of two between 4 and 64
|
||||
const int splitH = 16;
|
||||
GGML_ASSERT(head_dim % splitH == 0);
|
||||
const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
|
||||
ssm_scan_f32_group<16, 128><<<blocks, threads, 0, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
} else if (d_state == 256) { // Falcon-H1
|
||||
constexpr int threads = 256;
|
||||
constexpr int num_warps = threads/WARP_SIZE;
|
||||
|
||||
const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
|
||||
ssm_scan_f32_group<256/WARP_SIZE, 256><<<blocks, threads, 0, stream>>>(
|
||||
const int threads = 256;
|
||||
// NOTE: can be any power of two between 8 and 64
|
||||
const int splitH = 16;
|
||||
GGML_ASSERT(head_dim % splitH == 0);
|
||||
const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
|
||||
ssm_scan_f32_group<16, 256><<<blocks, threads, 0, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
@@ -224,7 +260,6 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
}
|
||||
} else {
|
||||
// Mamba-1
|
||||
constexpr int threads = 128;
|
||||
GGML_ASSERT(n_head % threads == 0);
|
||||
GGML_ASSERT(head_dim == 1);
|
||||
GGML_ASSERT(n_group == 1);
|
||||
|
||||
@@ -1773,37 +1773,6 @@ static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
const struct ggml_tensor * src2 = op->src[2];
|
||||
const struct ggml_tensor * src3 = op->src[3];
|
||||
const struct ggml_tensor * src4 = op->src[4];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
// Check for F16 support only as requested
|
||||
if ((src0->type != GGML_TYPE_F16 && src0->type != GGML_TYPE_F32) || src1->type != GGML_TYPE_F16 || src2->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src3 && src3->type != GGML_TYPE_F16) { // mask
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src4 && src4->type != GGML_TYPE_F32) { // sinks
|
||||
return false;
|
||||
}
|
||||
|
||||
// For now we support F32 or F16 output as htp backend often converts output on the fly if needed,
|
||||
// but the op implementation writes to F16 or F32.
|
||||
// Let's assume dst can be F32 or F16.
|
||||
if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return opt_experimental;
|
||||
}
|
||||
|
||||
static bool hex_supported_src0_type(ggml_type t) {
|
||||
return t == GGML_TYPE_F32;
|
||||
}
|
||||
@@ -1846,11 +1815,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
if (dst->type != GGML_TYPE_F32) {
|
||||
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
|
||||
// TODO: add support for non-cont tensors
|
||||
if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1866,6 +1836,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||
return false; // typically the lm-head which would be too large for VTCM
|
||||
}
|
||||
|
||||
// if ((src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3])) return false;
|
||||
if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1914,10 +1885,21 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
||||
}
|
||||
break;
|
||||
|
||||
case GGML_TYPE_F16:
|
||||
if (!opt_experimental) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: add support for non-cont tensors
|
||||
if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2078,46 +2060,6 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_set_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0]; // values
|
||||
const struct ggml_tensor * src1 = op->src[1]; // indices
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1->type != GGML_TYPE_I32 && src1->type != GGML_TYPE_I64) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0]; // values
|
||||
const struct ggml_tensor * src1 = op->src[1]; // indices
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1->type != GGML_TYPE_I32 && src1->type != GGML_TYPE_I64) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const int32_t * op_params = &op->op_params[0];
|
||||
|
||||
@@ -2212,11 +2154,6 @@ static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_t
|
||||
d->offset = (uint8_t *) t->data - buf->base;
|
||||
d->size = ggml_nbytes(t);
|
||||
|
||||
if (!d->size) {
|
||||
// Some requests contain srcs where ggml_nbytes() returns 0 but the rest of the op is non-empty
|
||||
d->size = 64;
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
|
||||
// Flush CPU
|
||||
@@ -2302,17 +2239,6 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
req->op = HTP_OP_GET_ROWS;
|
||||
|
||||
size_t n_bufs = 0;
|
||||
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
||||
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
template <bool _is_src0_constant>
|
||||
static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
@@ -2340,17 +2266,6 @@ static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer *
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static inline size_t init_set_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
req->op = HTP_OP_SET_ROWS;
|
||||
|
||||
size_t n_bufs = 0;
|
||||
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
||||
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
||||
|
||||
@@ -2362,11 +2277,6 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
|
||||
supported = true;
|
||||
break;
|
||||
|
||||
case GGML_OP_SCALE:
|
||||
req->op = HTP_OP_SCALE;
|
||||
supported = true;
|
||||
break;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
|
||||
req->op = HTP_OP_UNARY_SILU;
|
||||
@@ -2421,21 +2331,6 @@ static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
||||
req->op = HTP_OP_FLASH_ATTN_EXT;
|
||||
|
||||
size_t n_bufs = 0;
|
||||
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
||||
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
||||
return sess->name.c_str();
|
||||
@@ -2522,7 +2417,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
||||
break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
@@ -2545,18 +2439,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_SET_ROWS:
|
||||
ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_GET_ROWS:
|
||||
ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
default:
|
||||
GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
|
||||
}
|
||||
@@ -2896,7 +2778,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
break;
|
||||
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
break;
|
||||
|
||||
@@ -2924,18 +2805,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_rope(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
supp = ggml_hexagon_supported_flash_attn_ext(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_SET_ROWS:
|
||||
supp = ggml_hexagon_supported_set_rows(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_GET_ROWS:
|
||||
supp = ggml_hexagon_supported_get_rows(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -28,9 +28,6 @@ add_library(${HTP_LIB} SHARED
|
||||
softmax-ops.c
|
||||
act-ops.c
|
||||
rope-ops.c
|
||||
flash-attn-ops.c
|
||||
set-rows-ops.c
|
||||
get-rows-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -1,566 +0,0 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#ifdef HTP_DEBUG
|
||||
# define FARF_HIGH 1
|
||||
#endif
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_mem.h>
|
||||
#include <HAP_perf.h>
|
||||
#include <hexagon_protos.h>
|
||||
#include <hexagon_types.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-dma.h"
|
||||
#include "htp-msg.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "ops-utils.h"
|
||||
|
||||
// Dot product of FP32 and FP16 vectors, accumulating to float
|
||||
static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
|
||||
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
|
||||
const HVX_Vector * restrict vx = (const HVX_Vector * restrict) x; // fp16
|
||||
|
||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
|
||||
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
|
||||
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
|
||||
|
||||
// Load x (fp16)
|
||||
HVX_Vector x_hf = vx[i];
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
|
||||
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
|
||||
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
|
||||
|
||||
// Load x (fp16)
|
||||
HVX_Vector x_hf = vx[i];
|
||||
|
||||
// Zero-out unused elements
|
||||
// Note that we need to clear both x and y because they may contain NANs
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
|
||||
x_hf = Q6_V_vand_QV(bmask, x_hf);
|
||||
y_hf = Q6_V_vand_QV(bmask, y_hf);
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s));
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum));
|
||||
|
||||
hvx_vec_store_u(r, 4, rsum);
|
||||
}
|
||||
|
||||
// Dot product of two F16 vectors, accumulating to float
|
||||
static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict x, const void * restrict y, unsigned int n, float s) {
|
||||
const HVX_Vector * restrict vx = (const HVX_Vector * restrict) x; // fp16
|
||||
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp16
|
||||
|
||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
HVX_Vector y_hf = vy[i];
|
||||
HVX_Vector x_hf = vx[i];
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector y_hf = vy[i];
|
||||
|
||||
// Load x (fp16) and zero-out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
|
||||
HVX_Vector x_hf = Q6_V_vand_QV(bmask, vx[i]);
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s));
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum));
|
||||
hvx_vec_store_u(r, 4, rsum);
|
||||
}
|
||||
|
||||
// MAD: y (F32) += x (F16) * v (float)
|
||||
static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict x, int n, float s) {
|
||||
const HVX_Vector * restrict ptr_x = (const HVX_Vector *) x;
|
||||
HVX_Vector * restrict ptr_y = (HVX_Vector *) y;
|
||||
|
||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
HVX_Vector S = hvx_vec_splat_fp16(s);
|
||||
|
||||
uint32_t i = 0;
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; ++i) {
|
||||
// Multiply x * s -> pair of F32 vectors
|
||||
HVX_VectorPair xs_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x[i]), S);
|
||||
ptr_y[i*2] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(xs_p), ptr_y[i*2]));
|
||||
ptr_y[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(xs_p), ptr_y[i*2+1]));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_VectorPair xs_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x[i]), S);
|
||||
|
||||
HVX_Vector xs = Q6_V_lo_W(xs_p);
|
||||
i = 2 * i; // index for ptr_y
|
||||
|
||||
if (nloe >= 32) {
|
||||
ptr_y[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i]));
|
||||
nloe -= 32; ++i; xs = Q6_V_hi_W(xs_p);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i]));
|
||||
hvx_vec_store_u(&ptr_y[i], nloe * 4, xy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define FLASH_ATTN_BLOCK_SIZE 128
|
||||
|
||||
static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, int nth) {
|
||||
const struct htp_tensor * q = &octx->src0;
|
||||
const struct htp_tensor * k = &octx->src1;
|
||||
const struct htp_tensor * v = &octx->src2;
|
||||
const struct htp_tensor * mask = (octx->src3.data) ? &octx->src3 : NULL;
|
||||
const struct htp_tensor * sinks = (octx->src4.data) ? &octx->src4 : NULL;
|
||||
struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
const uint32_t neq0 = q->ne[0];
|
||||
const uint32_t neq1 = q->ne[1];
|
||||
const uint32_t neq2 = q->ne[2];
|
||||
const uint32_t neq3 = q->ne[3];
|
||||
|
||||
const uint32_t nek0 = k->ne[0];
|
||||
const uint32_t nek1 = k->ne[1];
|
||||
const uint32_t nek2 = k->ne[2];
|
||||
const uint32_t nek3 = k->ne[3];
|
||||
|
||||
const uint32_t nev0 = v->ne[0];
|
||||
const uint32_t nev1 = v->ne[1];
|
||||
const uint32_t nev2 = v->ne[2];
|
||||
const uint32_t nev3 = v->ne[3];
|
||||
|
||||
const uint32_t nbq1 = q->nb[1];
|
||||
const uint32_t nbq2 = q->nb[2];
|
||||
const uint32_t nbq3 = q->nb[3];
|
||||
|
||||
const uint32_t nbk1 = k->nb[1];
|
||||
const uint32_t nbk2 = k->nb[2];
|
||||
const uint32_t nbk3 = k->nb[3];
|
||||
|
||||
const uint32_t nbv1 = v->nb[1];
|
||||
const uint32_t nbv2 = v->nb[2];
|
||||
const uint32_t nbv3 = v->nb[3];
|
||||
|
||||
const uint32_t ne1 = dst->ne[1];
|
||||
const uint32_t ne2 = dst->ne[2];
|
||||
const uint32_t ne3 = dst->ne[3];
|
||||
|
||||
const uint32_t nb1 = dst->nb[1];
|
||||
const uint32_t nb2 = dst->nb[2];
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float logit_softcap = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) octx->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) octx->op_params + 1, sizeof(float));
|
||||
memcpy(&logit_softcap, (float *) octx->op_params + 2, sizeof(float));
|
||||
|
||||
if (logit_softcap != 0) {
|
||||
scale /= logit_softcap;
|
||||
}
|
||||
|
||||
// total rows in q
|
||||
const uint32_t nr = neq1*neq2*neq3;
|
||||
|
||||
const uint32_t dr = (nr + nth - 1) / nth;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
if (ir0 >= ir1) return;
|
||||
|
||||
dma_queue * dma = octx->ctx->dma[ith];
|
||||
|
||||
const uint32_t DK = nek0;
|
||||
const uint32_t DV = nev0;
|
||||
|
||||
const size_t size_q_row = DK * ((q->type == HTP_TYPE_F32) ? 4 : 2);
|
||||
const size_t size_q_row_padded = htp_round_up(size_q_row, 128);
|
||||
|
||||
const size_t size_k_row = DK * sizeof(__fp16);
|
||||
const size_t size_v_row = DV * sizeof(__fp16);
|
||||
const size_t size_m_row = FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16); // Treat block as one row for mask
|
||||
|
||||
const size_t size_k_row_padded = htp_round_up(size_k_row, 128);
|
||||
const size_t size_v_row_padded = htp_round_up(size_v_row, 128);
|
||||
|
||||
const size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE;
|
||||
const size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE;
|
||||
const size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);
|
||||
|
||||
// Scratchpad buffers for Q, K, V, Mask, and VKQ32 accumulator
|
||||
uint8_t * spad_q = octx->src0_spad.data + octx->src0_spad.size_per_thread * ith;
|
||||
uint8_t * spad_k = octx->src1_spad.data + octx->src1_spad.size_per_thread * ith;
|
||||
uint8_t * spad_v = octx->src2_spad.data + octx->src2_spad.size_per_thread * ith;
|
||||
uint8_t * spad_m = octx->src3_spad.data + octx->src3_spad.size_per_thread * ith;
|
||||
uint8_t * spad_a = octx->dst_spad.data + octx->dst_spad.size_per_thread * ith;
|
||||
|
||||
const uint32_t n_head = neq2;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
for (uint32_t ir = ir0; ir < ir1; ++ir) {
|
||||
const uint32_t iq3 = fastdiv(ir, &octx->src0_div21);
|
||||
const uint32_t iq2 = fastdiv(ir - iq3*neq2*neq1, &octx->src0_div1);
|
||||
const uint32_t iq1 = (ir - iq3*neq2*neq1 - iq2 * neq1);
|
||||
|
||||
const uint32_t ik3 = fastdiv(iq3, &octx->broadcast_rk3);
|
||||
const uint32_t ik2 = fastdiv(iq2, &octx->broadcast_rk2);
|
||||
|
||||
const uint32_t iv3 = fastdiv(iq3, &octx->broadcast_rv3);
|
||||
const uint32_t iv2 = fastdiv(iq2, &octx->broadcast_rv2);
|
||||
|
||||
// Fetch Q row
|
||||
const uint8_t * q_row_ptr = (const uint8_t *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3);
|
||||
dma_queue_push(dma, dma_make_ptr(spad_q, q_row_ptr), size_q_row_padded, nbq1, size_q_row, 1);
|
||||
|
||||
const uint32_t h = iq2; // head index
|
||||
const float slope = (max_bias > 0.0f) ? (h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)) : 1.0f;
|
||||
|
||||
float S = 0.0f; // sum
|
||||
float M = -INFINITY; // maximum KQ value
|
||||
|
||||
// Clear accumulator
|
||||
float * VKQ32 = (float *) spad_a;
|
||||
memset(VKQ32, 0, DV * sizeof(float));
|
||||
|
||||
const __fp16 * mp_base = NULL;
|
||||
if (mask) {
|
||||
const uint32_t im2 = fastmodulo(iq2, mask->ne[2], &octx->src3_div2);
|
||||
const uint32_t im3 = fastmodulo(iq3, mask->ne[3], &octx->src3_div3);
|
||||
mp_base = (const __fp16 *) ((const uint8_t *) mask->data + iq1*mask->nb[1] + im2*mask->nb[2] + im3*mask->nb[3]);
|
||||
}
|
||||
|
||||
const uint32_t n_blocks = (nek1 + FLASH_ATTN_BLOCK_SIZE - 1) / FLASH_ATTN_BLOCK_SIZE;
|
||||
|
||||
// Prefetch first two blocks
|
||||
for (uint32_t ib = 0; ib < MIN(n_blocks, 2); ++ib) {
|
||||
const uint32_t ic_start = ib * FLASH_ATTN_BLOCK_SIZE;
|
||||
const uint32_t current_block_size = MIN(FLASH_ATTN_BLOCK_SIZE, nek1 - ic_start);
|
||||
|
||||
// K
|
||||
const uint8_t * k_src = (const uint8_t *) k->data + (ic_start*nbk1 + ik2*nbk2 + ik3*nbk3);
|
||||
uint8_t * k_dst = spad_k + (ib % 2) * size_k_block;
|
||||
dma_queue_push(dma, dma_make_ptr(k_dst, k_src), size_k_row_padded, nbk1, size_k_row, current_block_size);
|
||||
|
||||
// V
|
||||
const uint8_t * v_src = (const uint8_t *) v->data + (ic_start*nbv1 + iv2*nbv2 + iv3*nbv3);
|
||||
uint8_t * v_dst = spad_v + (ib % 2) * size_v_block;
|
||||
dma_queue_push(dma, dma_make_ptr(v_dst, v_src), size_v_row_padded, nbv1, size_v_row, current_block_size);
|
||||
|
||||
// Mask
|
||||
if (mask) {
|
||||
const uint8_t * m_src = (const uint8_t *) (mp_base + ic_start);
|
||||
uint8_t * m_dst = spad_m + (ib % 2) * size_m_block;
|
||||
// Mask is 1D contiguous for this row
|
||||
dma_queue_push(dma, dma_make_ptr(m_dst, m_src), current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
|
||||
}
|
||||
}
|
||||
|
||||
const uint8_t * q_ptr_vtcm = dma_queue_pop(dma).dst;
|
||||
|
||||
for (uint32_t ib = 0; ib < n_blocks; ++ib) {
|
||||
const uint32_t ic_start = ib * FLASH_ATTN_BLOCK_SIZE;
|
||||
const uint32_t current_block_size = MIN(FLASH_ATTN_BLOCK_SIZE, nek1 - ic_start);
|
||||
|
||||
// Wait for DMA
|
||||
uint8_t * k_base = dma_queue_pop(dma).dst; // K
|
||||
uint8_t * v_base = dma_queue_pop(dma).dst; // V
|
||||
__fp16 * m_base = mask ? dma_queue_pop(dma).dst : NULL; // M
|
||||
|
||||
// Inner loop processing the block from VTCM
|
||||
uint32_t ic = 0;
|
||||
|
||||
// Process in blocks of 32 (VLEN_FP32)
|
||||
for (; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32) {
|
||||
// 1. Compute scores
|
||||
float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32];
|
||||
for (int j = 0; j < VLEN_FP32; ++j) {
|
||||
const uint32_t cur_ic = ic + j;
|
||||
const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded;
|
||||
if (q->type == HTP_TYPE_F32) {
|
||||
hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
|
||||
} else {
|
||||
hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
|
||||
}
|
||||
}
|
||||
|
||||
HVX_Vector scores = *(HVX_Vector *) scores_arr;
|
||||
|
||||
// 2. Softcap
|
||||
if (logit_softcap != 0.0f) {
|
||||
scores = hvx_vec_tanh_fp32(scores);
|
||||
scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_fp32(logit_softcap));
|
||||
scores = Q6_Vsf_equals_Vqf32(scores);
|
||||
}
|
||||
|
||||
// 3. Mask
|
||||
if (mask) {
|
||||
const __fp16 * mp = m_base + ic;
|
||||
HVX_Vector m_vals_fp16 = *(const HVX_UVector *) mp;
|
||||
|
||||
HVX_Vector one_fp16 = Q6_Vh_vsplat_R(0x3c00);
|
||||
HVX_VectorPair m_vals_fp32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_fp16), one_fp16);
|
||||
|
||||
HVX_Vector m_vals_fp32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_fp32_pair));
|
||||
|
||||
HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);
|
||||
HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_fp32, slope_vec);
|
||||
scores = Q6_Vqf32_vadd_VsfVsf(scores, Q6_Vsf_equals_Vqf32(add_val));
|
||||
scores = Q6_Vsf_equals_Vqf32(scores);
|
||||
}
|
||||
|
||||
// 4. Online Softmax Update
|
||||
HVX_Vector v_max = hvx_vec_reduce_max_fp32(scores);
|
||||
float m_block = hvx_vec_get_fp32(v_max);
|
||||
|
||||
float M_old = M;
|
||||
float M_new = (m_block > M) ? m_block : M;
|
||||
M = M_new;
|
||||
|
||||
float ms = expf(M_old - M_new);
|
||||
|
||||
hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
|
||||
S = S * ms;
|
||||
|
||||
HVX_Vector M_new_vec = hvx_vec_splat_fp32(M_new);
|
||||
HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_new_vec);
|
||||
HVX_Vector P = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(scores_shifted));
|
||||
|
||||
HVX_Vector p_sum_vec = hvx_vec_fp32_reduce_sum(P);
|
||||
float p_sum = hvx_vec_get_fp32(p_sum_vec);
|
||||
S += p_sum;
|
||||
|
||||
// 5. Accumulate V
|
||||
float __attribute__((aligned(VLEN))) p_arr[VLEN_FP32];
|
||||
*(HVX_Vector*)p_arr = P;
|
||||
|
||||
for (int j = 0; j < VLEN_FP32; ++j) {
|
||||
const uint32_t cur_ic = ic + j;
|
||||
const uint8_t * v_ptr = v_base + cur_ic * size_v_row_padded;
|
||||
hvx_mad_f32_f16_aa(VKQ32, v_ptr, DV, p_arr[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Leftover
|
||||
for (; ic < current_block_size; ++ic) {
|
||||
float s_val;
|
||||
const uint8_t * k_ptr = k_base + ic * size_k_row_padded;
|
||||
|
||||
if (q->type == HTP_TYPE_F32) {
|
||||
hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
|
||||
} else {
|
||||
hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
|
||||
}
|
||||
|
||||
if (logit_softcap != 0.0f) {
|
||||
s_val = logit_softcap * tanhf(s_val);
|
||||
}
|
||||
|
||||
if (mask) {
|
||||
const float m_val = m_base[ic];
|
||||
s_val += slope * m_val;
|
||||
}
|
||||
|
||||
const float Mold = M;
|
||||
float ms = 1.0f;
|
||||
float vs = 1.0f;
|
||||
|
||||
if (s_val > M) {
|
||||
M = s_val;
|
||||
ms = expf(Mold - M);
|
||||
hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
|
||||
} else {
|
||||
vs = expf(s_val - M);
|
||||
}
|
||||
|
||||
const uint8_t * v_ptr = v_base + ic * size_v_row_padded;
|
||||
|
||||
hvx_mad_f32_f16_aa(VKQ32, v_ptr, DV, vs);
|
||||
|
||||
S = S * ms + vs;
|
||||
}
|
||||
|
||||
// Issue DMA for next+1 block (if exists)
|
||||
if (ib + 2 < n_blocks) {
|
||||
const uint32_t next_ib = ib + 2;
|
||||
const uint32_t next_ic_start = next_ib * FLASH_ATTN_BLOCK_SIZE;
|
||||
const uint32_t next_block_size = MIN(FLASH_ATTN_BLOCK_SIZE, nek1 - next_ic_start);
|
||||
|
||||
// K
|
||||
const uint8_t * k_src = (const uint8_t *) k->data + (next_ic_start*nbk1 + ik2*nbk2 + ik3*nbk3);
|
||||
dma_queue_push(dma, dma_make_ptr(k_base, k_src), size_k_row_padded, nbk1, size_k_row, next_block_size);
|
||||
|
||||
// V
|
||||
const uint8_t * v_src = (const uint8_t *) v->data + (next_ic_start*nbv1 + iv2*nbv2 + iv3*nbv3);
|
||||
dma_queue_push(dma, dma_make_ptr(v_base, v_src), size_v_row_padded, nbv1, size_v_row, next_block_size);
|
||||
|
||||
// Mask
|
||||
if (mask) {
|
||||
const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start);
|
||||
dma_queue_push(dma, dma_make_ptr(m_base, m_src), next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sinks
|
||||
if (sinks) {
|
||||
const float s = ((float *)((char *) sinks->data))[h];
|
||||
|
||||
float ms = 1.0f;
|
||||
float vs = 1.0f;
|
||||
|
||||
if (s > M) {
|
||||
ms = expf(M - s);
|
||||
hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
|
||||
} else {
|
||||
vs = expf(s - M);
|
||||
}
|
||||
|
||||
S = S * ms + vs;
|
||||
}
|
||||
|
||||
const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
|
||||
hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, S_inv);
|
||||
|
||||
// Store result
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// dst is permuted
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;
|
||||
|
||||
if (dst->type == HTP_TYPE_F32) {
|
||||
hvx_copy_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
|
||||
} else if (dst->type == HTP_TYPE_F16) {
|
||||
hvx_copy_fp16_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Worker-pool callback for FLASH_ATTN_EXT.
// `data` carries the op context; the callback's (n, i) pair is handed to the
// per-thread kernel in swapped order, i.e. as (octx, i, n).
static void htp_flash_attn_ext_job(unsigned int n, unsigned int i, void * data) {
    flash_attn_ext_f16_thread((struct htp_ops_context *) data, i, n);
}
|
||||
|
||||
int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * q = &octx->src0;
|
||||
const struct htp_tensor * k = &octx->src1;
|
||||
const struct htp_tensor * v = &octx->src2;
|
||||
const struct htp_tensor * mask = (octx->src3.type != HTP_TYPE_COUNT) ? &octx->src3 : NULL;
|
||||
struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
// Check support
|
||||
if ((q->type != HTP_TYPE_F16 && q->type != HTP_TYPE_F32) ||
|
||||
k->type != HTP_TYPE_F16 ||
|
||||
v->type != HTP_TYPE_F16) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
octx->src0_div21 = init_fastdiv_values(q->ne[2] * q->ne[1]);
|
||||
octx->src0_div1 = init_fastdiv_values(q->ne[1]);
|
||||
|
||||
octx->broadcast_rk2 = init_fastdiv_values(q->ne[2]/k->ne[2]);
|
||||
octx->broadcast_rk3 = init_fastdiv_values(q->ne[3]/k->ne[3]);
|
||||
octx->broadcast_rv2 = init_fastdiv_values(q->ne[2]/v->ne[2]);
|
||||
octx->broadcast_rv3 = init_fastdiv_values(q->ne[3]/v->ne[3]);
|
||||
|
||||
if (mask) {
|
||||
octx->src3_div2 = init_fastdiv_values(mask->ne[2]);
|
||||
octx->src3_div3 = init_fastdiv_values(mask->ne[3]);
|
||||
}
|
||||
|
||||
size_t size_q_row_padded = htp_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 4 : 2), 128);
|
||||
size_t size_k_row_padded = htp_round_up(k->ne[0] * sizeof(__fp16), 128);
|
||||
size_t size_v_row_padded = htp_round_up(v->ne[0] * sizeof(__fp16), 128);
|
||||
|
||||
size_t size_q_block = size_q_row_padded * 1; // single row for now
|
||||
size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE;
|
||||
size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE;
|
||||
size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);
|
||||
|
||||
size_t size_vkq_acc = htp_round_up(v->ne[0] * sizeof(float), 128); // VKQ32
|
||||
|
||||
octx->src0_spad.size_per_thread = size_q_block * 1;
|
||||
octx->src1_spad.size_per_thread = size_k_block * 2;
|
||||
octx->src2_spad.size_per_thread = size_v_block * 2;
|
||||
octx->src3_spad.size_per_thread = mask ? size_m_block * 2 : 0;
|
||||
octx->dst_spad.size_per_thread = size_vkq_acc;
|
||||
|
||||
octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
|
||||
octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads;
|
||||
octx->src2_spad.size = octx->src2_spad.size_per_thread * octx->n_threads;
|
||||
octx->src3_spad.size = octx->src3_spad.size_per_thread * octx->n_threads;
|
||||
octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
|
||||
|
||||
size_t total_spad = octx->src0_spad.size + octx->src1_spad.size + octx->src2_spad.size + octx->src3_spad.size + octx->dst_spad.size;
|
||||
|
||||
if (octx->ctx->vtcm_size < total_spad) {
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size;
|
||||
octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size;
|
||||
octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size;
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, htp_flash_attn_ext_job, octx, octx->n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -1,112 +0,0 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#ifdef HTP_DEBUG
|
||||
# define FARF_HIGH 1
|
||||
#endif
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_mem.h>
|
||||
#include <HAP_perf.h>
|
||||
#include <hexagon_protos.h>
|
||||
#include <hexagon_types.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-msg.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "ops-utils.h"
|
||||
|
||||
// Declares the locals shared by the GET_ROWS entry point and its worker:
//   ne0x / nb0x : dims and byte strides of src0,
//   ne1x / nb1x : dims and byte strides of src1,
//   nb1..nb3    : byte strides of dst,
//   nr          : ne10 * ne11 * ne12.
// Expects a `struct htp_ops_context * octx` to be in scope.
#define get_rows_preamble                       \
    const uint32_t ne00 = octx->src0.ne[0];     \
    const uint32_t ne01 = octx->src0.ne[1];     \
    const uint32_t ne02 = octx->src0.ne[2];     \
    const uint32_t ne03 = octx->src0.ne[3];     \
                                                \
    const uint32_t ne10 = octx->src1.ne[0];     \
    const uint32_t ne11 = octx->src1.ne[1];     \
    const uint32_t ne12 = octx->src1.ne[2];     \
                                                \
    const uint32_t nb01 = octx->src0.nb[1];     \
    const uint32_t nb02 = octx->src0.nb[2];     \
    const uint32_t nb03 = octx->src0.nb[3];     \
                                                \
    const uint32_t nb10 = octx->src1.nb[0];     \
    const uint32_t nb11 = octx->src1.nb[1];     \
    const uint32_t nb12 = octx->src1.nb[2];     \
                                                \
    const uint32_t nb1 = octx->dst.nb[1];       \
    const uint32_t nb2 = octx->dst.nb[2];       \
    const uint32_t nb3 = octx->dst.nb[3];       \
                                                \
    const uint32_t nr = ne10 * ne11 * ne12;
|
||||
|
||||
static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
get_rows_preamble;
|
||||
|
||||
// parallelize by src1 elements (which correspond to dst rows)
|
||||
const uint32_t dr = octx->src1_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
|
||||
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t i12 = fastdiv(i, &octx->get_rows_div_ne10_ne11);
|
||||
const uint32_t rem = i - i12 * ne11 * ne10;
|
||||
const uint32_t i11 = fastdiv(rem, &octx->get_rows_div_ne10);
|
||||
const uint32_t i10 = rem - i11 * ne10;
|
||||
|
||||
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
|
||||
uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
|
||||
if (i01 >= ne01) {
|
||||
// invalid index, skip for now to avoid crash
|
||||
continue;
|
||||
}
|
||||
|
||||
const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3;
|
||||
hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
// Worker-pool callback for GET_ROWS: unpacks the op context from `data` and
// runs the F32 worker with (n, i) as its (nth, ith) arguments.
// The worker's status code is discarded.
static void get_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    (void) get_rows_thread_f32_f32(octx, n, i);
}
|
||||
|
||||
int op_get_rows(struct htp_ops_context * octx) {
|
||||
get_rows_preamble;
|
||||
|
||||
if (octx->src0.type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->dst.type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
octx->get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]);
|
||||
octx->get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]);
|
||||
|
||||
const uint32_t n_jobs = MIN(nr, octx->n_threads);
|
||||
octx->src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_work_f32_f32, octx, n_jobs);
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -11,6 +11,11 @@
|
||||
|
||||
#define HTP_MAX_NTHREADS 10
|
||||
|
||||
// FIXME: move these into matmul-ops
|
||||
#define HTP_SPAD_SRC0_NROWS 16
|
||||
#define HTP_SPAD_SRC1_NROWS 16
|
||||
#define HTP_SPAD_DST_NROWS 2
|
||||
|
||||
// Main context for htp DSP backend
|
||||
struct htp_context {
|
||||
dspqueue_t queue;
|
||||
|
||||
@@ -36,8 +36,6 @@ enum htp_data_type {
|
||||
HTP_TYPE_F16 = 1,
|
||||
HTP_TYPE_Q4_0 = 2,
|
||||
HTP_TYPE_Q8_0 = 8,
|
||||
HTP_TYPE_I32 = 26,
|
||||
HTP_TYPE_I64 = 27,
|
||||
HTP_TYPE_MXFP4 = 39,
|
||||
HTP_TYPE_COUNT
|
||||
};
|
||||
@@ -59,10 +57,6 @@ enum htp_op {
|
||||
HTP_OP_SOFTMAX = 11,
|
||||
HTP_OP_ADD_ID = 12,
|
||||
HTP_OP_ROPE = 13,
|
||||
HTP_OP_FLASH_ATTN_EXT = 14,
|
||||
HTP_OP_SET_ROWS = 15,
|
||||
HTP_OP_SCALE = 16,
|
||||
HTP_OP_GET_ROWS = 17,
|
||||
INVALID
|
||||
};
|
||||
|
||||
@@ -143,8 +137,6 @@ struct htp_general_req {
|
||||
struct htp_tensor src0; // Input0 tensor
|
||||
struct htp_tensor src1; // Input1 tensor
|
||||
struct htp_tensor src2; // Input2 tensor
|
||||
struct htp_tensor src3; // Input3 tensor
|
||||
struct htp_tensor src4; // Input4 tensor
|
||||
struct htp_tensor dst; // Output tensor
|
||||
|
||||
// should be multiple of 64 bytes (cacheline)
|
||||
@@ -160,6 +152,6 @@ struct htp_general_rsp {
|
||||
};
|
||||
|
||||
#define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req)
|
||||
#define HTP_MAX_PACKET_BUFFERS 8
|
||||
#define HTP_MAX_PACKET_BUFFERS 4
|
||||
|
||||
#endif /* HTP_MSG_H */
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
|
||||
struct htp_spad {
|
||||
uint8_t * data;
|
||||
size_t stride;
|
||||
size_t size;
|
||||
size_t size_per_thread;
|
||||
};
|
||||
@@ -27,14 +26,11 @@ struct htp_ops_context {
|
||||
struct htp_tensor src0;
|
||||
struct htp_tensor src1;
|
||||
struct htp_tensor src2;
|
||||
struct htp_tensor src3;
|
||||
struct htp_tensor src4;
|
||||
struct htp_tensor dst;
|
||||
|
||||
struct htp_spad src0_spad;
|
||||
struct htp_spad src1_spad;
|
||||
struct htp_spad src2_spad;
|
||||
struct htp_spad src3_spad;
|
||||
struct htp_spad dst_spad;
|
||||
|
||||
worker_pool_context_t * wpool; // worker pool
|
||||
@@ -53,27 +49,6 @@ struct htp_ops_context {
|
||||
struct fastdiv_values src1_div3; // fastdiv values for ne3
|
||||
struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1
|
||||
|
||||
struct fastdiv_values src3_div1; // fastdiv values for ne1
|
||||
struct fastdiv_values src3_div2; // fastdiv values for ne2
|
||||
struct fastdiv_values src3_div3; // fastdiv values for ne3
|
||||
struct fastdiv_values src3_div21; // fastdiv values for ne2 * ne1
|
||||
|
||||
struct fastdiv_values broadcast_rk2;
|
||||
struct fastdiv_values broadcast_rk3;
|
||||
struct fastdiv_values broadcast_rv2;
|
||||
struct fastdiv_values broadcast_rv3;
|
||||
|
||||
struct fastdiv_values mm_div_ne12_ne1; // fastdiv values for ne12 * ne1
|
||||
struct fastdiv_values mm_div_ne1; // fastdiv values for ne1
|
||||
struct fastdiv_values mm_div_r2; // fastdiv values for ne12 / ne02
|
||||
struct fastdiv_values mm_div_r3; // fastdiv values for ne13 / ne03
|
||||
|
||||
struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12
|
||||
struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11
|
||||
|
||||
struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10
|
||||
struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11
|
||||
|
||||
uint32_t flags;
|
||||
};
|
||||
|
||||
@@ -85,8 +60,5 @@ int op_activations(struct htp_ops_context * octx);
|
||||
int op_softmax(struct htp_ops_context * octx);
|
||||
int op_add_id(struct htp_ops_context * octx);
|
||||
int op_rope(struct htp_ops_context * octx);
|
||||
int op_flash_attn_ext(struct htp_ops_context * octx);
|
||||
int op_set_rows(struct htp_ops_context * octx);
|
||||
int op_get_rows(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_OPS_H */
|
||||
|
||||
@@ -848,6 +848,55 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
|
||||
return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
|
||||
}
|
||||
|
||||
void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale) {
|
||||
int left_over = num_elems & (VLEN_FP32 - 1);
|
||||
int num_elems_whole = num_elems - left_over;
|
||||
|
||||
int unaligned_addr = 0;
|
||||
int unaligned_loop = 0;
|
||||
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
|
||||
FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n");
|
||||
unaligned_addr = 1;
|
||||
}
|
||||
|
||||
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
|
||||
unaligned_loop = 1;
|
||||
FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n");
|
||||
}
|
||||
|
||||
HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);
|
||||
|
||||
if (0 == unaligned_loop) {
|
||||
HVX_Vector * vec_in1 = (HVX_Vector *) src;
|
||||
HVX_Vector * vec_out = (HVX_Vector *) dst;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
|
||||
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
} else {
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
||||
|
||||
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
|
||||
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
|
||||
}
|
||||
}
|
||||
|
||||
if (left_over > 0) {
|
||||
const float * srcf = (const float *) src + num_elems_whole;
|
||||
float * dstf = (float *) dst + num_elems_whole;
|
||||
|
||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||
|
||||
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
|
||||
}
|
||||
}
|
||||
|
||||
float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
|
||||
int left_over = num_elems & (VLEN_FP32 - 1);
|
||||
int num_elems_whole = num_elems - left_over;
|
||||
@@ -1016,5 +1065,3 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -41,24 +41,15 @@ static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline HVX_Vector hvx_vec_splat_fp32(float v) {
|
||||
static inline HVX_Vector hvx_vec_splat_fp32(float i) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} fp32 = { .f = v };
|
||||
float f;
|
||||
int32_t i;
|
||||
} fp32 = { .f = i };
|
||||
|
||||
return Q6_V_vsplat_R(fp32.i);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_splat_fp16(float v) {
|
||||
union {
|
||||
__fp16 f;
|
||||
uint16_t i;
|
||||
} fp16 = { .f = v };
|
||||
|
||||
return Q6_Vh_vsplat_R(fp16.i);
|
||||
}
|
||||
|
||||
static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) {
|
||||
// Rotate as needed.
|
||||
v = Q6_V_vlalign_VVR(v, v, (size_t) addr);
|
||||
@@ -251,120 +242,6 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest
|
||||
}
|
||||
}
|
||||
|
||||
// copy n fp32 elements : source is unaligned, destination unaligned
|
||||
static inline void hvx_copy_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
|
||||
HVX_UVector * restrict vdst = (HVX_UVector *) dst;
|
||||
HVX_UVector * restrict vsrc = (HVX_UVector *) src;
|
||||
|
||||
assert((unsigned long) dst % 128 == 0);
|
||||
|
||||
uint32_t nvec = n / 32;
|
||||
uint32_t nloe = n % 32;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (; i < nvec; i++) {
|
||||
HVX_Vector v = vsrc[i];
|
||||
vdst[i] = v;
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = vsrc[i];
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
|
||||
}
|
||||
}
|
||||
|
||||
// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned
|
||||
static inline void hvx_copy_fp16_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
|
||||
HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16
|
||||
HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t nvec = n / 64;
|
||||
uint32_t nloe = n % 64;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (; i < nvec; i++) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
|
||||
HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
|
||||
HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
|
||||
vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
|
||||
HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
|
||||
HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
|
||||
}
|
||||
}
|
||||
|
||||
// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned
|
||||
static inline void hvx_copy_fp16_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
|
||||
HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16
|
||||
HVX_Vector * restrict vsrc = (HVX_Vector *) src; // fp32
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t nvec = n / 64;
|
||||
uint32_t nloe = n % 64;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (; i < nvec; i++) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
|
||||
HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
|
||||
HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
|
||||
vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
|
||||
HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
|
||||
HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
|
||||
}
|
||||
}
|
||||
|
||||
// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned
|
||||
static inline void hvx_copy_fp16_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
|
||||
HVX_Vector * restrict vdst = (HVX_Vector *) dst; // fp16
|
||||
HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t nvec = n / 64;
|
||||
uint32_t nloe = n % 64;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (; i < nvec; i++) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
|
||||
HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
|
||||
HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
|
||||
vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
|
||||
HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
|
||||
HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
|
||||
}
|
||||
}
|
||||
|
||||
// bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned
|
||||
static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) {
|
||||
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
|
||||
@@ -396,6 +273,8 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
|
||||
return right_off <= chunk_size;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
|
||||
HVX_VectorAlias u = { .v = v };
|
||||
|
||||
@@ -652,13 +531,13 @@ static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
|
||||
#if __HVX_ARCH__ > 75
|
||||
#if __HTP_ARCH__ > 75
|
||||
return Q6_Vsf_vfneg_Vsf(v);
|
||||
#else
|
||||
// neg by setting the fp32 sign bit
|
||||
HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
|
||||
return Q6_V_vxor_VV(v, mask);
|
||||
#endif // __HVX_ARCH__ > 75
|
||||
#endif // __HTP_ARCH__ > 75
|
||||
}
|
||||
|
||||
// ====================================================
|
||||
@@ -1097,24 +976,6 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
|
||||
return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
|
||||
}
|
||||
|
||||
// Element-wise tanh of an fp32 vector, computed through the identity
//   tanh(x) = 2 * sigmoid(2x) - 1
// using the guarded fast sigmoid with guard bounds of -87 and +87.
static inline HVX_Vector hvx_vec_tanh_fp32(HVX_Vector x) {
    static const float kMinExp = -87.f;  // 0
    static const float kMaxExp = 87.f;   // 1

    const HVX_Vector v_one = hvx_vec_splat_fp32(1.0f);
    const HVX_Vector v_two = hvx_vec_splat_fp32(2.0f);
    const HVX_Vector v_hi  = hvx_vec_splat_fp32(kMaxExp);
    const HVX_Vector v_lo  = hvx_vec_splat_fp32(kMinExp);

    // 2x, converted back from qf32 to sf for the sigmoid helper
    const HVX_Vector two_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(x, v_two));

    const HVX_Vector sig = hvx_vec_fast_sigmoid_fp32_guard(two_x, v_one, v_hi, v_lo);

    // 2 * sigmoid(2x) - 1
    HVX_Vector acc = Q6_Vqf32_vmpy_VsfVsf(sig, v_two);
    acc            = Q6_Vqf32_vsub_Vqf32Vsf(acc, v_one);

    return Q6_Vsf_equals_Vqf32(acc);
}
|
||||
|
||||
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
||||
int step_of_1 = num_elems >> 5;
|
||||
int remaining = num_elems - step_of_1 * VLEN_FP32;
|
||||
@@ -1195,115 +1056,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
|
||||
int nvec = n / VLEN_FP32;
|
||||
int nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector vs = hvx_vec_splat_fp32(scale);
|
||||
|
||||
HVX_Vector * vsrc = (HVX_Vector *) src;
|
||||
HVX_Vector * vdst = (HVX_Vector *) dst;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; ++i) {
|
||||
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
|
||||
vdst[i] = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
|
||||
int nvec = n / VLEN_FP32;
|
||||
int nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector vs = hvx_vec_splat_fp32(scale);
|
||||
|
||||
HVX_UVector * vsrc = (HVX_UVector *) src;
|
||||
HVX_UVector * vdst = (HVX_UVector *) dst;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; ++i) {
|
||||
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
|
||||
vdst[i] = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
|
||||
if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) {
|
||||
hvx_scale_f32_aa(dst, src, n, scale);
|
||||
} else {
|
||||
hvx_scale_f32_uu(dst, src, n, scale);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
|
||||
int nvec = n / VLEN_FP32;
|
||||
int nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector vs = hvx_vec_splat_fp32(scale);
|
||||
HVX_Vector vo = hvx_vec_splat_fp32(offset);
|
||||
|
||||
HVX_Vector * vsrc = (HVX_Vector *) src;
|
||||
HVX_Vector * vdst = (HVX_Vector *) dst;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; ++i) {
|
||||
HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
|
||||
vdst[i] = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
|
||||
int nvec = n / VLEN_FP32;
|
||||
int nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector vs = hvx_vec_splat_fp32(scale);
|
||||
HVX_Vector vo = hvx_vec_splat_fp32(offset);
|
||||
|
||||
HVX_UVector * vsrc = (HVX_UVector *) src;
|
||||
HVX_UVector * vdst = (HVX_UVector *) dst;
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; ++i) {
|
||||
HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
|
||||
vdst[i] = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
|
||||
hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
|
||||
if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) {
|
||||
hvx_scale_offset_f32_aa(dst, src, n, scale, offset);
|
||||
} else {
|
||||
hvx_scale_offset_f32_uu(dst, src, n, scale, offset);
|
||||
}
|
||||
}
|
||||
|
||||
float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
|
||||
void hvx_mul_f32(const uint8_t * restrict src0,
|
||||
@@ -1338,6 +1090,7 @@ void hvx_sub_f32_opt(const uint8_t * restrict src0,
|
||||
uint8_t * restrict dst,
|
||||
const int num_elems);
|
||||
void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
|
||||
void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale);
|
||||
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
|
||||
void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
|
||||
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate);
|
||||
|
||||
@@ -443,45 +443,6 @@ static void proc_matmul_req(struct htp_context * ctx,
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[1];
|
||||
|
||||
// We had written to the output buffer, we'd also need to flush it
|
||||
rsp_bufs[0].fd = bufs[2].fd;
|
||||
rsp_bufs[0].ptr = bufs[2].ptr;
|
||||
rsp_bufs[0].offset = bufs[2].offset;
|
||||
rsp_bufs[0].size = bufs[2].size;
|
||||
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
||||
|
||||
// Setup Op context
|
||||
struct htp_ops_context octx = { 0 };
|
||||
octx.ctx = ctx;
|
||||
octx.src0 = req->src0;
|
||||
octx.src1 = req->src1;
|
||||
octx.dst = req->dst;
|
||||
octx.flags = req->flags;
|
||||
octx.op = req->op;
|
||||
|
||||
// Update data pointers
|
||||
octx.src0.data = (uint32_t) bufs[0].ptr;
|
||||
octx.src1.data = (uint32_t) bufs[1].ptr;
|
||||
octx.dst.data = (uint32_t) bufs[2].ptr;
|
||||
octx.n_threads = ctx->n_threads;
|
||||
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
||||
rsp_status = op_get_rows(&octx);
|
||||
vtcm_release(ctx);
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_matmul_id_req(struct htp_context * ctx,
|
||||
struct htp_general_req * req,
|
||||
struct dspqueue_buffer * bufs,
|
||||
@@ -707,7 +668,7 @@ static void proc_rope_req(struct htp_context * ctx,
|
||||
uint32_t n_bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
|
||||
|
||||
int write_idx = n_bufs - 1;
|
||||
int write_idx = (n_bufs == 4) ? 3 : 2;
|
||||
|
||||
// We had written to the output buffer, we'd also need to flush it
|
||||
rsp_bufs[0].fd = bufs[write_idx].fd;
|
||||
@@ -755,102 +716,6 @@ static void proc_rope_req(struct htp_context * ctx,
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[1];
|
||||
|
||||
// We had written to the output buffer, we'd also need to flush it
|
||||
rsp_bufs[0].fd = bufs[2].fd;
|
||||
rsp_bufs[0].ptr = bufs[2].ptr;
|
||||
rsp_bufs[0].offset = bufs[2].offset;
|
||||
rsp_bufs[0].size = bufs[2].size;
|
||||
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
||||
|
||||
// Setup Op context
|
||||
struct htp_ops_context octx = { 0 };
|
||||
octx.ctx = ctx;
|
||||
octx.src0 = req->src0;
|
||||
octx.src1 = req->src1;
|
||||
octx.dst = req->dst;
|
||||
octx.flags = req->flags;
|
||||
octx.op = req->op;
|
||||
|
||||
// Update data pointers
|
||||
octx.src0.data = (uint32_t) bufs[0].ptr;
|
||||
octx.src1.data = (uint32_t) bufs[1].ptr;
|
||||
octx.dst.data = (uint32_t) bufs[2].ptr;
|
||||
octx.n_threads = ctx->n_threads;
|
||||
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
||||
rsp_status = op_set_rows(&octx);
|
||||
vtcm_release(ctx);
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_flash_attn_ext_req(struct htp_context * ctx,
|
||||
struct htp_general_req * req,
|
||||
struct dspqueue_buffer * bufs,
|
||||
uint32_t n_bufs) {
|
||||
// Setup Op context
|
||||
struct htp_ops_context octx;
|
||||
memset(&octx, 0, sizeof(octx));
|
||||
|
||||
octx.ctx = ctx;
|
||||
octx.n_threads = ctx->n_threads;
|
||||
|
||||
octx.src0 = req->src0;
|
||||
octx.src1 = req->src1;
|
||||
octx.src2 = req->src2;
|
||||
octx.src3 = req->src3;
|
||||
octx.src4 = req->src4;
|
||||
octx.dst = req->dst;
|
||||
octx.flags = req->flags;
|
||||
octx.op = req->op;
|
||||
|
||||
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
||||
|
||||
// Update data pointers
|
||||
octx.src0.data = (uint32_t) bufs[0].ptr;
|
||||
octx.src1.data = (uint32_t) bufs[1].ptr;
|
||||
octx.src2.data = (uint32_t) bufs[2].ptr;
|
||||
|
||||
int last_buf = 3;
|
||||
|
||||
if (octx.src3.ne[0]) {
|
||||
octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
|
||||
}
|
||||
|
||||
if (octx.src4.ne[0]) {
|
||||
octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
|
||||
}
|
||||
|
||||
octx.dst.data = (uint32_t) bufs[last_buf].ptr;
|
||||
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
||||
rsp_status = op_flash_attn_ext(&octx);
|
||||
vtcm_release(ctx);
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
|
||||
struct dspqueue_buffer rsp_buf = bufs[last_buf];
|
||||
rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
||||
|
||||
send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
|
||||
}
|
||||
|
||||
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
struct htp_context * ctx = (struct htp_context *) context;
|
||||
|
||||
@@ -925,7 +790,6 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
break;
|
||||
|
||||
case HTP_OP_RMS_NORM:
|
||||
case HTP_OP_SCALE:
|
||||
if (n_bufs != 2) {
|
||||
FARF(ERROR, "Bad unary-req buffer list");
|
||||
continue;
|
||||
@@ -969,30 +833,6 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
proc_rope_req(ctx, &req, bufs, n_bufs);
|
||||
break;
|
||||
|
||||
case HTP_OP_FLASH_ATTN_EXT:
|
||||
if (!(n_bufs >= 4 && n_bufs <= 6)) {
|
||||
FARF(ERROR, "Bad flash-attn-ext-req buffer list");
|
||||
continue;
|
||||
}
|
||||
proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
|
||||
break;
|
||||
|
||||
case HTP_OP_SET_ROWS:
|
||||
if (n_bufs != 3) {
|
||||
FARF(ERROR, "Bad set-rows-req buffer list");
|
||||
continue;
|
||||
}
|
||||
proc_set_rows_req(ctx, &req, bufs);
|
||||
break;
|
||||
|
||||
case HTP_OP_GET_ROWS:
|
||||
if (n_bufs != 3) {
|
||||
FARF(ERROR, "Bad get-rows-req buffer list");
|
||||
continue;
|
||||
}
|
||||
proc_get_rows_req(ctx, &req, bufs);
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unknown Op %u", req.op);
|
||||
break;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,168 +0,0 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#ifdef HTP_DEBUG
|
||||
# define FARF_HIGH 1
|
||||
#endif
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_mem.h>
|
||||
#include <HAP_perf.h>
|
||||
#include <hexagon_protos.h>
|
||||
#include <hexagon_types.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-msg.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "ops-utils.h"
|
||||
|
||||
// Declares const shorthands for the tensor extents (ne*) and byte strides (nb*)
// read from octx->src0, octx->src1 and octx->dst, plus nr (= ne01).
// Requires a `struct htp_ops_context * octx` in scope at the expansion site.
#define set_rows_preamble                        \
    const uint32_t ne00 = octx->src0.ne[0];      \
    const uint32_t ne01 = octx->src0.ne[1];      \
    const uint32_t ne02 = octx->src0.ne[2];      \
    const uint32_t ne03 = octx->src0.ne[3];      \
    const uint32_t nb01 = octx->src0.nb[1];      \
    const uint32_t nb02 = octx->src0.nb[2];      \
    const uint32_t nb03 = octx->src0.nb[3];      \
                                                 \
    const uint32_t ne10 = octx->src1.ne[0];      \
    const uint32_t ne11 = octx->src1.ne[1];      \
    const uint32_t ne12 = octx->src1.ne[2];      \
    const uint32_t nb10 = octx->src1.nb[0];      \
    const uint32_t nb11 = octx->src1.nb[1];      \
    const uint32_t nb12 = octx->src1.nb[2];      \
                                                 \
    const uint32_t ne1 = octx->dst.ne[1];        \
    const uint32_t nb1 = octx->dst.nb[1];        \
    const uint32_t nb2 = octx->dst.nb[2];        \
    const uint32_t nb3 = octx->dst.nb[3];        \
                                                 \
    const uint32_t nr = ne01;
|
||||
|
||||
static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
set_rows_preamble;
|
||||
|
||||
// parallelize by rows of src0
|
||||
const uint32_t dr = octx->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
|
||||
|
||||
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
|
||||
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12);
|
||||
const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11);
|
||||
const uint32_t i10 = i;
|
||||
|
||||
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
|
||||
uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
if (i1 >= ne1) {
|
||||
// ignore invalid indices
|
||||
continue;
|
||||
}
|
||||
|
||||
const uintptr_t src0_ptr = octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
|
||||
const uintptr_t dst_ptr = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
|
||||
|
||||
// copy row
|
||||
hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
set_rows_preamble;
|
||||
|
||||
// parallelize by rows of src0
|
||||
const uint32_t dr = octx->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
|
||||
|
||||
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
|
||||
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12);
|
||||
const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11);
|
||||
const uint32_t i10 = i;
|
||||
|
||||
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
|
||||
uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
if (i1 >= ne1) {
|
||||
// ignore invalid indices
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint8_t* src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
|
||||
uint8_t* dst_ptr = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
|
||||
|
||||
hvx_copy_fp16_fp32_uu(dst_ptr, src0_ptr, ne00);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
// Worker-pool entry point: unpacks the op context from `data` and runs the
// f16 set_rows worker as job i of n. The worker's status is discarded.
static void set_rows_work_f16_f32(unsigned int n, unsigned int i, void *data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    (void) set_rows_thread_f16_f32(octx, n, i);
}
|
||||
|
||||
// Worker-pool entry point: unpacks the op context from `data` and runs the
// f32 set_rows worker as job i of n. The worker's status is discarded.
static void set_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    (void) set_rows_thread_f32_f32(octx, n, i);
}
|
||||
|
||||
int op_set_rows(struct htp_ops_context * octx) {
|
||||
set_rows_preamble;
|
||||
|
||||
if (octx->src0.type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->dst.type != HTP_TYPE_F32 && octx->dst.type != HTP_TYPE_F16) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
octx->set_rows_div_ne12 = init_fastdiv_values(ne12);
|
||||
octx->set_rows_div_ne11 = init_fastdiv_values(ne11);
|
||||
|
||||
const uint32_t n_jobs = MIN(nr, octx->n_threads);
|
||||
octx->src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
|
||||
|
||||
switch(octx->dst.type) {
|
||||
case HTP_TYPE_F32:
|
||||
worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f32_f32, octx, n_jobs);
|
||||
break;
|
||||
case HTP_TYPE_F16:
|
||||
worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f16_f32, octx, n_jobs);
|
||||
break;
|
||||
default:
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -238,7 +238,7 @@ static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ct
|
||||
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
|
||||
(const uint8_t *) mp_f32, slope);
|
||||
} else {
|
||||
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, softmax_ctx->scale);
|
||||
hvx_scale_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale);
|
||||
if (mp_f32) {
|
||||
if (softmax_ctx->use_f16) {
|
||||
for (int i = 0; i < ne00; ++i) {
|
||||
@@ -258,7 +258,7 @@ static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ct
|
||||
float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);
|
||||
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
|
||||
sum = sum > 0.0 ? (1.0 / sum) : 1;
|
||||
hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum);
|
||||
hvx_scale_f32((const uint8_t *) wp2, (uint8_t *) dp, ne00, sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,31 +83,6 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
|
||||
}
|
||||
}
|
||||
|
||||
// Apply dst = src * scale + bias row by row.
// scale and bias are read as raw float bits from op_params[0] and op_params[1].
// Rows are taken row_elems floats apart in both src and dst; before a row is
// processed, htp_l2fetch is issued for the row that follows it (if any).
// spad and opt_path are not used by this function.
static void scale_htp_f32(const float * restrict src,
                          float * restrict       dst,
                          uint8_t * restrict     spad,
                          const uint32_t         num_rows,
                          const uint32_t         row_elems,
                          const size_t           row_size,
                          int32_t *              op_params,
                          int                    opt_path) {
    float scale = 0.f;
    float bias  = 0.f;
    memcpy(&scale, &op_params[0], sizeof(float));
    memcpy(&bias, &op_params[1], sizeof(float));

    uint32_t row = 0;
    while (row < num_rows) {
        const uint32_t base = row * row_elems;

        const float * restrict in_row  = src + base;
        float * restrict       out_row = dst + base;

        const bool has_next = (row + 1 < num_rows);
        if (has_next) {
            htp_l2fetch(in_row + row_elems, 1, row_size, row_size);
        }

        hvx_scale_offset_f32((uint8_t *) out_row, (const uint8_t *) in_row, row_elems, scale, bias);

        row++;
    }
}
|
||||
|
||||
static void rms_norm_htp_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
@@ -135,7 +110,7 @@ static void rms_norm_htp_f32(const float * restrict src,
|
||||
const float mean = sum / row_elems;
|
||||
const float scale = 1.0f / sqrtf(mean + epsilon);
|
||||
|
||||
hvx_scale_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale);
|
||||
hvx_scale_f32((const uint8_t *) src_local, (uint8_t *) dst_local, row_elems, scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -187,9 +162,6 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src,
|
||||
case HTP_OP_RMS_NORM:
|
||||
rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
|
||||
break;
|
||||
case HTP_OP_SCALE:
|
||||
scale_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
@@ -223,10 +195,6 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
unary_op_func = unary_job_dispatcher_f32;
|
||||
op_type = "rmsnorm-f32";
|
||||
break;
|
||||
case HTP_OP_SCALE:
|
||||
unary_op_func = unary_job_dispatcher_f32;
|
||||
op_type = "scale-f32";
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
|
||||
@@ -219,8 +219,6 @@ struct ggml_metal_device_props {
|
||||
bool use_shared_buffers;
|
||||
|
||||
bool supports_gpu_family_apple7;
|
||||
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(void);
|
||||
|
||||
@@ -782,8 +782,6 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
||||
|
||||
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
|
||||
dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
|
||||
dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
|
||||
|
||||
@@ -625,11 +625,14 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->op == GGML_OP_MUL_MAT ||
|
||||
op->op == GGML_OP_MUL_MAT_ID) &&
|
||||
get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
|
||||
get_op_batch_size(op) >= min_batch_size;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(op);
|
||||
}
|
||||
|
||||
static ggml_backend_device_i ggml_backend_metal_device_i = {
|
||||
|
||||
@@ -9148,7 +9148,6 @@ typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
|
||||
|
||||
@@ -57,7 +57,6 @@ set(GGML_OPENCL_KERNELS
|
||||
add
|
||||
add_id
|
||||
argsort
|
||||
fill
|
||||
clamp
|
||||
cpy
|
||||
cvt
|
||||
|
||||
@@ -489,7 +489,6 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
||||
cl_kernel kernel_relu;
|
||||
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
||||
cl_kernel kernel_fill;
|
||||
cl_kernel kernel_clamp;
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
||||
@@ -788,24 +787,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// fill
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "fill.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("fill.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_fill = clCreateKernel(prog, "kernel_fill_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
}
|
||||
|
||||
// clamp
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3123,8 +3104,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_FILL:
|
||||
return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
@@ -5881,36 +5860,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
}
|
||||
|
||||
// Enqueue the fill kernel: every element of dst is set to the float stored in
// dst->op_params[0].
// src0 and src1 are unused; dst must have device-side extra data attached.
static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src0);
    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    // the fill value is stored as raw float bits in op_params
    float v = 0.0f;
    memcpy(&v, ((int32_t *) dst->op_params), sizeof(float));

    const int64_t n = ggml_nelements(dst);

    // kernel_fill_f32 declares its element count as `int n`: hand over a real int
    // of matching size instead of the first sizeof(float) bytes of an int64_t.
    const int n_arg = (int) n;
    GGML_ASSERT((int64_t) n_arg == n);  // element count must fit the kernel's int

    cl_kernel kernel = backend_ctx->kernel_fill;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(float),    &v));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),      &n_arg));

    // round the global size up to a multiple of the work-group size
    size_t local_work_size[1]  = { 256 };
    size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };

    backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
}
|
||||
|
||||
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -9646,12 +9595,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_glu;
|
||||
break;
|
||||
case GGML_OP_FILL:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_fill;
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// fill
|
||||
//------------------------------------------------------------------------------
|
||||
// Writes v to element get_global_id(0) of the float buffer that starts
// offsetd bytes into dst; work-items whose id is not below n write nothing.
__kernel void kernel_fill_f32(
        __global float *dst,
        ulong offsetd,
        float v,
        int n
) {
    const size_t gid = get_global_id(0);

    // apply the byte offset before indexing
    dst = (global float*)((global char*)dst + offsetd);

    if (!(gid < n)) {
        return;
    }

    dst[gid] = v;
}
|
||||
+398
-163
@@ -6,6 +6,9 @@
|
||||
#include <cinttypes>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include <condition_variable>
|
||||
#include <future>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
@@ -30,6 +33,8 @@
|
||||
#include <fstream>
|
||||
#include <filesystem>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <thread>
|
||||
|
||||
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
|
||||
|
||||
@@ -107,6 +112,7 @@ enum rpc_cmd {
|
||||
RPC_CMD_HELLO,
|
||||
RPC_CMD_DEVICE_COUNT,
|
||||
RPC_CMD_GRAPH_RECOMPUTE,
|
||||
RPC_CMD_NONE,
|
||||
RPC_CMD_COUNT,
|
||||
};
|
||||
|
||||
@@ -261,17 +267,18 @@ struct graph_cache {
|
||||
std::vector<ggml_tensor> last_graph;
|
||||
};
|
||||
|
||||
class rpc_dispatcher;
|
||||
struct ggml_backend_rpc_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
graph_cache gc;
|
||||
std::shared_ptr<rpc_dispatcher> dispatcher;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
graph_cache gc;
|
||||
};
|
||||
|
||||
struct ggml_backend_rpc_buffer_context {
|
||||
std::shared_ptr<socket_t> sock;
|
||||
void * base_ptr;
|
||||
uint64_t remote_ptr;
|
||||
std::shared_ptr<rpc_dispatcher> dispatcher;
|
||||
void * base_ptr;
|
||||
uint64_t remote_ptr;
|
||||
};
|
||||
|
||||
// RPC helper functions
|
||||
@@ -495,67 +502,267 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
||||
|
||||
// RPC client-side implementation
|
||||
|
||||
static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
// Mutex-protected FIFO with a blocking pop and a one-way "interrupted" state.
// After interrupt() every push() and pop() returns false; elements still
// queued at that point are not handed out.
template <typename T>
class message_queue {
public:
    message_queue() {}

    // Appends value and wakes waiting consumers.
    // Returns false (and stores nothing) once the queue has been interrupted.
    bool push(T value) {
        std::unique_lock<std::mutex> guard(mutex);
        if (!interrupted) {
            queue.push(std::move(value));
            cvar.notify_all();
            return true;
        }
        return false;
    }

    // Blocks until an element is available or the queue is interrupted.
    // Returns true and moves the front element into out, or false on
    // interruption (out is left untouched).
    bool pop(T& out) {
        std::unique_lock<std::mutex> guard(mutex);
        while (queue.empty() && !interrupted) {
            cvar.wait(guard);
        }
        if (interrupted) {
            return false;
        }
        out = std::move(queue.front());
        queue.pop();
        return true;
    }

    // Marks the queue as interrupted and wakes every waiter.
    void interrupt() {
        {
            std::lock_guard<std::mutex> guard(mutex);
            interrupted = true;
        }
        // notify after the lock has been released
        cvar.notify_all();
    }

private:
    bool                    interrupted = false;
    std::queue<T>           queue;
    std::mutex              mutex;
    std::condition_variable cvar;
};
|
||||
|
||||
class rpc_dispatcher {
|
||||
public:
|
||||
rpc_dispatcher() {
|
||||
}
|
||||
|
||||
void send(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size);
|
||||
void send(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size, void * output, size_t output_size);
|
||||
void send_async(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size);
|
||||
void send_async(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size, void * output, size_t output_size);
|
||||
|
||||
ggml_backend_event_t event_new(ggml_backend_dev_t dev);
|
||||
void event_free(ggml_backend_event_t event);
|
||||
void event_synchronize(ggml_backend_event_t event);
|
||||
void event_record(ggml_backend_event_t event);
|
||||
void synchronize();
|
||||
|
||||
void start(const std::string & endpoint);
|
||||
void work();
|
||||
|
||||
~rpc_dispatcher();
|
||||
|
||||
private:
|
||||
struct rpc_msg {
|
||||
rpc_cmd cmd;
|
||||
std::shared_ptr<const void> input;
|
||||
size_t input_size;
|
||||
void * output;
|
||||
size_t output_size;
|
||||
std::promise<void> completion;
|
||||
};
|
||||
using rpc_msg_ptr = std::unique_ptr<rpc_msg>;
|
||||
using rpc_msg_queue = message_queue<rpc_msg_ptr>;
|
||||
struct rpc_event {
|
||||
rpc_msg_ptr msg;
|
||||
std::shared_future<void> sf;
|
||||
};
|
||||
rpc_msg_queue queue;
|
||||
std::shared_ptr<socket_t> sock;
|
||||
std::atomic_bool running;
|
||||
std::thread thread;
|
||||
};
|
||||
|
||||
// Free-function adapter that forwards to rpc_dispatcher::work() on the given dispatcher.
static void rpc_dispatcher_trampoline(rpc_dispatcher * dispatcher)
{
    rpc_dispatcher & self = *dispatcher;
    self.work();
}
|
||||
|
||||
void rpc_dispatcher::send(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size) {
|
||||
auto msg = std::make_unique<rpc_msg>();
|
||||
msg->cmd = cmd;
|
||||
msg->input = input;
|
||||
msg->input_size = input_size;
|
||||
msg->output = nullptr;
|
||||
msg->output_size = 0;
|
||||
GGML_ASSERT(queue.push(msg));
|
||||
auto future = msg->completion.get_future();
|
||||
future.wait();
|
||||
}
|
||||
|
||||
void rpc_dispatcher::send_async(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size) {
|
||||
auto msg = std::make_unique<rpc_msg>();
|
||||
msg->cmd = cmd;
|
||||
msg->input = input;
|
||||
msg->input_size = input_size;
|
||||
msg->output = nullptr;
|
||||
msg->output_size = 0;
|
||||
GGML_ASSERT(queue.push(msg));
|
||||
}
|
||||
|
||||
void rpc_dispatcher::send(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size, void * output, size_t output_size) {
|
||||
auto msg = std::make_unique<rpc_msg>();
|
||||
msg->cmd = cmd;
|
||||
msg->input = input;
|
||||
msg->input_size = input_size;
|
||||
msg->output = output;
|
||||
msg->output_size = output_size;
|
||||
GGML_ASSERT(queue.push(msg));
|
||||
auto future = msg->completion.get_future();
|
||||
future.wait();
|
||||
}
|
||||
|
||||
void rpc_dispatcher::send_async(enum rpc_cmd cmd, std::shared_ptr<const void> input, size_t input_size, void * output, size_t output_size) {
|
||||
auto msg = std::make_unique<rpc_msg>();
|
||||
msg->cmd = cmd;
|
||||
msg->input = input;
|
||||
msg->input_size = input_size;
|
||||
msg->output = output;
|
||||
msg->output_size = output_size;
|
||||
GGML_ASSERT(queue.push(msg));
|
||||
}
|
||||
|
||||
// Create a backend event for dev.
// An RPC_CMD_NONE marker message is queued right away and the event keeps a
// shared_future tied to that message's completion promise.
// Returns a heap-allocated ggml_backend_event whose context is the rpc_event.
ggml_backend_event_t rpc_dispatcher::event_new(ggml_backend_dev_t dev) {
    rpc_event * ev = new rpc_event;
    ev->msg = std::make_unique<rpc_msg>();
    ev->msg->cmd = RPC_CMD_NONE;
    // grab the future before the message is handed to the queue
    ev->sf = ev->msg->completion.get_future().share();
    // The queue takes its element by value, so ownership has to be moved in;
    // ev->msg is empty afterwards and only ev->sf is used to wait.
    GGML_ASSERT(queue.push(std::move(ev->msg)));
    return new ggml_backend_event {
        /* .device  = */ dev,
        /* .context = */ ev,
    };
}
|
||||
|
||||
// Destroy the rpc_event stored in event->context.
// NOTE(review): the ggml_backend_event object itself is not released here --
// confirm that the caller takes care of it.
void rpc_dispatcher::event_free(ggml_backend_event_t event) {
    delete static_cast<rpc_event *>(event->context);
}
|
||||
|
||||
// Block until the shared_future held by the event's rpc_event becomes ready.
void rpc_dispatcher::event_synchronize(ggml_backend_event_t event) {
    auto * pending = static_cast<rpc_event *>(event->context);
    pending->sf.wait();
}
|
||||
|
||||
// Re-arm the event: queue a fresh RPC_CMD_NONE marker message and point the
// event's shared_future at that message's completion promise.
void rpc_dispatcher::event_record(ggml_backend_event_t event) {
    rpc_event * ev = (rpc_event *)event->context;
    ev->msg = std::make_unique<rpc_msg>();
    ev->msg->cmd = RPC_CMD_NONE;
    // grab the future before the message is handed to the queue
    ev->sf = ev->msg->completion.get_future().share();
    // The queue takes its element by value, so ownership has to be moved in;
    // ev->msg is empty afterwards and only ev->sf is used to wait.
    GGML_ASSERT(queue.push(std::move(ev->msg)));
}
|
||||
|
||||
void rpc_dispatcher::synchronize() {
|
||||
// to ensure all messages are processed, submit dummy message and wait for it to complete
|
||||
auto msg = std::make_unique<rpc_msg>();
|
||||
msg->cmd = RPC_CMD_NONE;
|
||||
GGML_ASSERT(queue.push(msg));
|
||||
msg->completion.get_future().wait();
|
||||
}
|
||||
|
||||
static void check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_hello_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
||||
GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
return false;
|
||||
GGML_ABORT("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
}
|
||||
if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
|
||||
GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
|
||||
static bool initialized = false;
|
||||
|
||||
auto it = sockets.find(endpoint);
|
||||
if (it != sockets.end()) {
|
||||
if (auto sock = it->second.lock()) {
|
||||
return sock;
|
||||
}
|
||||
}
|
||||
void rpc_dispatcher::start(const std::string & endpoint) {
|
||||
static bool win32_init = false;
|
||||
std::string host;
|
||||
int port;
|
||||
if (!parse_endpoint(endpoint, host, port)) {
|
||||
GGML_LOG_ERROR("Failed to parse endpoint: %s\n", endpoint.c_str());
|
||||
return nullptr;
|
||||
GGML_ABORT("Failed to parse endpoint: %s\n", endpoint.c_str());
|
||||
}
|
||||
#ifdef _WIN32
|
||||
if (!initialized) {
|
||||
if (!win32_init) {
|
||||
WSADATA wsaData;
|
||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
||||
if (res != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
initialized = true;
|
||||
win32_init = true;
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(initialized);
|
||||
GGML_UNUSED(win32_init);
|
||||
#endif
|
||||
auto sock = socket_connect(host.c_str(), port);
|
||||
sock = socket_connect(host.c_str(), port);
|
||||
if (sock == nullptr) {
|
||||
return nullptr;
|
||||
GGML_ABORT("Failed to connect to %s\n", endpoint.c_str());
|
||||
}
|
||||
if (!check_server_version(sock)) {
|
||||
return nullptr;
|
||||
check_server_version(sock);
|
||||
LOG_DBG("[rpc_dispatcher] connected to %s, sockfd=%d\n", endpoint.c_str(), sock->fd);
|
||||
running = true;
|
||||
thread = std::thread(rpc_dispatcher_trampoline, this);
|
||||
}
|
||||
|
||||
void rpc_dispatcher::work() {
|
||||
while (running) {
|
||||
rpc_msg_ptr msg_ptr;
|
||||
if (!queue.pop(msg_ptr)) {
|
||||
break;
|
||||
}
|
||||
if (msg_ptr->cmd != RPC_CMD_NONE) {
|
||||
if (msg_ptr->output) {
|
||||
bool status = send_rpc_cmd(sock, msg_ptr->cmd, msg_ptr->input.get(), msg_ptr->input_size, msg_ptr->output, msg_ptr->output_size);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
} else {
|
||||
bool status = send_rpc_cmd(sock, msg_ptr->cmd, msg_ptr->input.get(), msg_ptr->input_size);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
}
|
||||
msg_ptr->completion.set_value();
|
||||
}
|
||||
LOG_DBG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
sockets[endpoint] = sock;
|
||||
return sock;
|
||||
}
|
||||
|
||||
// Stop the worker thread and release the connection.
rpc_dispatcher::~rpc_dispatcher() {
    // ask work() to leave its loop; interrupt() presumably wakes a pop() that
    // is blocked on an empty queue - confirm against the queue implementation
    running = false;
    queue.interrupt();

    // Join before touching `sock`: work() passes the `sock` member to
    // send_rpc_cmd(), so resetting it while the thread is still running would
    // race with (and could free the socket under) an in-flight command.
    if (thread.joinable()) {
        thread.join();
    }
    sock = nullptr;
}
|
||||
|
||||
static std::shared_ptr<rpc_dispatcher> get_dispatcher(const std::string & endpoint) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
static std::unordered_map<std::string, std::weak_ptr<rpc_dispatcher>> dispatchers;
|
||||
|
||||
auto it = dispatchers.find(endpoint);
|
||||
if (it != dispatchers.end()) {
|
||||
if (auto dispatcher = it->second.lock()) {
|
||||
return dispatcher;
|
||||
}
|
||||
}
|
||||
|
||||
auto dispatcher = std::make_shared<rpc_dispatcher>();
|
||||
dispatcher->start(endpoint);
|
||||
dispatchers[endpoint] = dispatcher;
|
||||
return dispatcher;
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
auto request = std::make_shared<rpc_msg_free_buffer_req>();
|
||||
request->remote_ptr = ctx->remote_ptr;
|
||||
ctx->dispatcher->send(RPC_CMD_FREE_BUFFER, request, sizeof(*request));
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -564,10 +771,10 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
if (ctx->base_ptr != nullptr) {
|
||||
return ctx->base_ptr;
|
||||
}
|
||||
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
|
||||
auto request = std::make_shared<rpc_msg_buffer_get_base_req>();
|
||||
request->remote_ptr = ctx->remote_ptr;
|
||||
rpc_msg_buffer_get_base_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
ctx->dispatcher->send(RPC_CMD_BUFFER_GET_BASE, request, sizeof(*request), &response, sizeof(response));
|
||||
ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
|
||||
return ctx->base_ptr;
|
||||
}
|
||||
@@ -623,12 +830,9 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_
|
||||
// Due to bandwidth constraints, we only call the server init tensor functions if necessary.
|
||||
// In particular, only quantized tensors need padding
|
||||
if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
|
||||
rpc_msg_init_tensor_req request;
|
||||
|
||||
request.tensor = serialize_tensor(tensor);
|
||||
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
auto request = std::make_shared<rpc_msg_init_tensor_req>();
|
||||
request->tensor = serialize_tensor(tensor);
|
||||
ctx->dispatcher->send(RPC_CMD_INIT_TENSOR, request, sizeof(*request));
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -637,13 +841,12 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
||||
if (size > HASH_THRESHOLD) {
|
||||
rpc_msg_set_tensor_hash_req request;
|
||||
request.tensor = rpc_tensor;
|
||||
request.offset = offset;
|
||||
request.hash = fnv_hash((const uint8_t*)data, size);
|
||||
auto request = std::make_shared<rpc_msg_set_tensor_hash_req>();
|
||||
request->tensor = rpc_tensor;
|
||||
request->offset = offset;
|
||||
request->hash = fnv_hash((const uint8_t*)data, size);
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
ctx->dispatcher->send(RPC_CMD_SET_TENSOR_HASH, request, sizeof(*request), &response, sizeof(response));
|
||||
if (response.result) {
|
||||
// the server has the same data, no need to send it
|
||||
return;
|
||||
@@ -651,22 +854,56 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
}
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes)
|
||||
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
|
||||
std::vector<uint8_t> input(input_size, 0);
|
||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
||||
RPC_STATUS_ASSERT(status);
|
||||
uint8_t * input = new uint8_t[input_size]();
|
||||
memcpy(input, &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||
std::shared_ptr<uint8_t> input_ptr(input, std::default_delete<uint8_t[]>());
|
||||
ctx->dispatcher->send(RPC_CMD_SET_TENSOR, input_ptr, input_size);
|
||||
}
|
||||
|
||||
// Asynchronously upload `size` bytes from `data` into `tensor` at byte `offset`.
// For payloads larger than HASH_THRESHOLD the server is first asked
// (synchronously) whether it already holds data with the same hash; if so,
// nothing is transferred. Otherwise the payload is copied into a private
// buffer and queued with send_async(), so `data` is not referenced after this
// function returns.
static void ggml_backend_rpc_buffer_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
    const rpc_tensor header = serialize_tensor(tensor);

    if (size > HASH_THRESHOLD) {
        auto request = std::make_shared<rpc_msg_set_tensor_hash_req>();
        request->tensor = header;
        request->offset = offset;
        request->hash   = fnv_hash((const uint8_t*)data, size);
        rpc_msg_set_tensor_hash_rsp response;
        // TODO: make this async
        ctx->dispatcher->send(RPC_CMD_SET_TENSOR_HASH, request, sizeof(*request), &response, sizeof(response));
        if (response.result) {
            // the server has the same data, no need to send it
            return;
        }
    }

    // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes)
    // The offset is always written as a fixed 8-byte value so that the layout
    // matches the buffer size computed below even where size_t is 4 bytes.
    const uint64_t wire_offset = offset;
    const size_t   header_size = sizeof(header) + sizeof(wire_offset);
    const size_t   input_size  = header_size + size;

    std::shared_ptr<uint8_t> input_ptr(new uint8_t[input_size](), std::default_delete<uint8_t[]>());
    uint8_t * input = input_ptr.get();
    memcpy(input,                  &header,      sizeof(header));
    memcpy(input + sizeof(header), &wire_offset, sizeof(wire_offset));
    memcpy(input + header_size,    data,         size);

    ctx->dispatcher->send_async(RPC_CMD_SET_TENSOR, input_ptr, input_size);
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_get_tensor_req request;
|
||||
request.tensor = serialize_tensor(tensor);
|
||||
request.offset = offset;
|
||||
request.size = size;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
auto request = std::make_shared<rpc_msg_get_tensor_req>();
|
||||
request->tensor = serialize_tensor(tensor);
|
||||
request->offset = offset;
|
||||
request->size = size;
|
||||
ctx->dispatcher->send(RPC_CMD_GET_TENSOR, request, sizeof(*request), data, size);
|
||||
}
|
||||
|
||||
// Queue an asynchronous RPC_CMD_GET_TENSOR request for `size` bytes of `tensor`
// starting at byte `offset`; the response is written to `data` by the
// dispatcher. The call returns without waiting for the transfer.
static void ggml_backend_rpc_buffer_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    auto * rpc_ctx = static_cast<ggml_backend_rpc_context *>(backend->context);

    auto req = std::make_shared<rpc_msg_get_tensor_req>();
    req->tensor = serialize_tensor(tensor);
    req->offset = offset;
    req->size   = size;

    rpc_ctx->dispatcher->send_async(RPC_CMD_GET_TENSOR, req, sizeof(*req), data, size);
}
|
||||
|
||||
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
@@ -676,16 +913,15 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||
ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
|
||||
ggml_backend_buffer_t dst_buffer = dst->buffer;
|
||||
ggml_backend_rpc_buffer_context * dst_ctx = (ggml_backend_rpc_buffer_context *)dst_buffer->context;
|
||||
if (src_ctx->sock != dst_ctx->sock) {
|
||||
if (src_ctx->dispatcher != dst_ctx->dispatcher) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_copy_tensor_req request;
|
||||
request.src = serialize_tensor(src);
|
||||
request.dst = serialize_tensor(dst);
|
||||
auto request = std::make_shared<rpc_msg_copy_tensor_req>();
|
||||
request->src = serialize_tensor(src);
|
||||
request->dst = serialize_tensor(dst);
|
||||
rpc_msg_copy_tensor_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
ctx->dispatcher->send(RPC_CMD_COPY_TENSOR, request, sizeof(*request), &response, sizeof(response));
|
||||
return response.result;
|
||||
}
|
||||
return false;
|
||||
@@ -693,9 +929,10 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||
|
||||
static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
auto request = std::make_shared<rpc_msg_buffer_clear_req>();
|
||||
request->remote_ptr = ctx->remote_ptr;
|
||||
request->value = value;
|
||||
ctx->dispatcher->send(RPC_CMD_BUFFER_CLEAR, request, sizeof(*request));
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
||||
@@ -717,15 +954,17 @@ static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
rpc_msg_alloc_buffer_req request = {buft_ctx->device, size};
|
||||
auto request = std::make_shared<rpc_msg_alloc_buffer_req>();
|
||||
request->device = buft_ctx->device;
|
||||
request->size = size;
|
||||
rpc_msg_alloc_buffer_rsp response;
|
||||
auto sock = get_socket(buft_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
|
||||
auto dispatcher = get_dispatcher(buft_ctx->endpoint);
|
||||
dispatcher->send(RPC_CMD_ALLOC_BUFFER, request, sizeof(*request), &response, sizeof(response));
|
||||
if (response.remote_ptr != 0) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||
ggml_backend_rpc_buffer_interface,
|
||||
new ggml_backend_rpc_buffer_context{sock, nullptr, response.remote_ptr},
|
||||
new ggml_backend_rpc_buffer_context{dispatcher, nullptr, response.remote_ptr},
|
||||
response.remote_size);
|
||||
return buffer;
|
||||
} else {
|
||||
@@ -733,11 +972,11 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
||||
}
|
||||
}
|
||||
|
||||
static size_t get_alignment(const std::shared_ptr<socket_t> & sock, uint32_t device) {
|
||||
rpc_msg_get_alignment_req request = {device};
|
||||
static size_t get_alignment(const std::shared_ptr<rpc_dispatcher> & dispatcher, uint32_t device) {
|
||||
auto request = std::make_shared<rpc_msg_get_alignment_req>();
|
||||
request->device = device;
|
||||
rpc_msg_get_alignment_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
dispatcher->send(RPC_CMD_GET_ALIGNMENT, request, sizeof(*request), &response, sizeof(response));
|
||||
return response.alignment;
|
||||
}
|
||||
|
||||
@@ -746,11 +985,11 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
|
||||
return buft_ctx->alignment;
|
||||
}
|
||||
|
||||
static size_t get_max_size(const std::shared_ptr<socket_t> & sock, uint32_t device) {
|
||||
rpc_msg_get_max_size_req request = {device};
|
||||
static size_t get_max_size(const std::shared_ptr<rpc_dispatcher> & dispatcher, uint32_t device) {
|
||||
auto request = std::make_shared<rpc_msg_get_max_size_req>();
|
||||
request->device = device;
|
||||
rpc_msg_get_max_size_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
dispatcher->send(RPC_CMD_GET_MAX_SIZE, request, sizeof(*request), &response, sizeof(response));
|
||||
return response.max_size;
|
||||
}
|
||||
|
||||
@@ -773,23 +1012,20 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
||||
|
||||
if (rpc_get) {
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
auto sock = get_socket(buft_ctx->endpoint);
|
||||
auto dispatcher = get_dispatcher(buft_ctx->endpoint);
|
||||
|
||||
rpc_msg_get_alloc_size_req request = {
|
||||
/*.device =*/ buft_ctx->device,
|
||||
/*.tensor =*/ serialize_tensor(tensor),
|
||||
/*.srcs =*/ {},
|
||||
};
|
||||
auto request = std::make_shared<rpc_msg_get_alloc_size_req>();
|
||||
request->device = buft_ctx->device;
|
||||
request->tensor = serialize_tensor(tensor);
|
||||
|
||||
// .get_alloc_size could be a function of the tensor's srcs, so we must serialize them as well
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
request.srcs[i] = serialize_tensor(tensor->src[i]);
|
||||
request->srcs[i] = serialize_tensor(tensor->src[i]);
|
||||
}
|
||||
|
||||
// TODO: cache the alloc responses to avoid extra RPC calls?
|
||||
rpc_msg_get_alloc_size_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
dispatcher->send(RPC_CMD_GET_ALLOC_SIZE, request, sizeof(*request), &response, sizeof(response));
|
||||
|
||||
return response.alloc_size;
|
||||
}
|
||||
@@ -819,8 +1055,8 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||
GGML_UNUSED(backend);
|
||||
// this is no-op because we don't have any async operations
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
rpc_ctx->dispatcher->synchronize();
|
||||
}
|
||||
|
||||
static void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited) {
|
||||
@@ -838,7 +1074,7 @@ static void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors,
|
||||
tensors.push_back(serialize_tensor(tensor));
|
||||
}
|
||||
|
||||
static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
|
||||
static uint8_t * serialize_graph(uint32_t device, const ggml_cgraph * cgraph, size_t * output_size) {
|
||||
uint32_t n_nodes = cgraph->n_nodes;
|
||||
std::vector<rpc_tensor> tensors;
|
||||
std::unordered_set<ggml_tensor*> visited;
|
||||
@@ -848,9 +1084,9 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
|
||||
// serialization format:
|
||||
// | device (4 bytes) | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
||||
uint32_t n_tensors = tensors.size();
|
||||
int output_size = 2*sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
|
||||
output.resize(output_size, 0);
|
||||
uint8_t * dest = output.data();
|
||||
*output_size = 2*sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
|
||||
uint8_t * output = new uint8_t[*output_size]();
|
||||
uint8_t * dest = output;
|
||||
memcpy(dest, &device, sizeof(device));
|
||||
dest += sizeof(device);
|
||||
memcpy(dest, &n_nodes, sizeof(n_nodes));
|
||||
@@ -863,6 +1099,7 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
|
||||
dest += sizeof(n_tensors);
|
||||
rpc_tensor * out_tensors = (rpc_tensor *)dest;
|
||||
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
|
||||
return output;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
@@ -871,27 +1108,34 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
||||
GGML_ASSERT(cgraph->n_nodes > 0);
|
||||
bool reuse = rpc_ctx->gc.is_cached(cgraph);
|
||||
if (reuse) {
|
||||
rpc_msg_graph_recompute_req request;
|
||||
request.device = rpc_ctx->device;
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
auto request = std::make_shared<rpc_msg_graph_recompute_req>();
|
||||
request->device = rpc_ctx->device;
|
||||
rpc_ctx->dispatcher->send_async(RPC_CMD_GRAPH_RECOMPUTE, request, sizeof(*request));
|
||||
} else {
|
||||
rpc_ctx->gc.add(cgraph);
|
||||
std::vector<uint8_t> input;
|
||||
serialize_graph(rpc_ctx->device, cgraph, input);
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size());
|
||||
RPC_STATUS_ASSERT(status);
|
||||
size_t input_size = 0;
|
||||
uint8_t * input = serialize_graph(rpc_ctx->device, cgraph, &input_size);
|
||||
std::shared_ptr<uint8_t> input_ptr(input, std::default_delete<uint8_t[]>());
|
||||
rpc_ctx->dispatcher->send_async(RPC_CMD_GRAPH_COMPUTE, input_ptr, input_size);
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Backend interface hook: record `event` on the dispatcher owned by `backend`.
static void ggml_backend_rpc_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
    auto * ctx = static_cast<ggml_backend_rpc_context *>(backend->context);
    ctx->dispatcher->event_record(event);
}
|
||||
|
||||
// Backend interface hook: wait for `event` by blocking the calling thread in
// the dispatcher's event_synchronize(). Note that `dev` is a backend handle
// (ggml_backend_t), not a device.
static void ggml_backend_rpc_event_wait(ggml_backend_t dev, ggml_backend_event_t event) {
    auto * rpc_ctx = static_cast<ggml_backend_rpc_context *>(dev->context);
    rpc_ctx->dispatcher->event_synchronize(event);
}
|
||||
|
||||
static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .get_name = */ ggml_backend_rpc_name,
|
||||
/* .free = */ ggml_backend_rpc_free,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .set_tensor_async = */ ggml_backend_rpc_buffer_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_rpc_buffer_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ ggml_backend_rpc_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
@@ -899,8 +1143,8 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_record = */ ggml_backend_rpc_event_record,
|
||||
/* .event_wait = */ ggml_backend_rpc_event_wait,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
@@ -914,13 +1158,9 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, u
|
||||
if (it != buft_map.end()) {
|
||||
return it->second;
|
||||
}
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
|
||||
return nullptr;
|
||||
}
|
||||
size_t alignment = get_alignment(sock, device);
|
||||
size_t max_size = get_max_size(sock, device);
|
||||
auto dispatcher = get_dispatcher(endpoint);
|
||||
size_t alignment = get_alignment(dispatcher, device);
|
||||
size_t max_size = get_max_size(dispatcher, device);
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ device,
|
||||
@@ -940,11 +1180,12 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, u
|
||||
|
||||
ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
|
||||
std::string dev_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
|
||||
auto dispatcher = get_dispatcher(endpoint);
|
||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ device,
|
||||
/* .name = */ dev_name,
|
||||
/* .gc = */ {},
|
||||
/* .dispatcher = */ dispatcher,
|
||||
/* .device = */ device,
|
||||
/* .name = */ dev_name,
|
||||
/* .gc = */ {},
|
||||
};
|
||||
auto reg = ggml_backend_rpc_add_server(endpoint);
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
@@ -960,26 +1201,16 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
||||
}
|
||||
|
||||
static void get_device_memory(const std::shared_ptr<socket_t> & sock, uint32_t device, size_t * free, size_t * total) {
|
||||
rpc_msg_get_device_memory_req request;
|
||||
request.device = device;
|
||||
void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total) {
|
||||
auto dispatcher = get_dispatcher(endpoint);
|
||||
auto request = std::make_shared<rpc_msg_get_device_memory_req>();
|
||||
request->device = device;
|
||||
rpc_msg_get_device_memory_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, &request, sizeof(request), &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
dispatcher->send(RPC_CMD_GET_DEVICE_MEMORY, request, sizeof(*request), &response, sizeof(response));
|
||||
*free = response.free_mem;
|
||||
*total = response.total_mem;
|
||||
}
|
||||
|
||||
void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total) {
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
return;
|
||||
}
|
||||
get_device_memory(sock, device, free, total);
|
||||
}
|
||||
|
||||
// RPC server-side implementation
|
||||
|
||||
class rpc_server {
|
||||
@@ -1701,9 +1932,6 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
|
||||
if (!server.free_buffer(request)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, nullptr, 0)) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_BUFFER_CLEAR: {
|
||||
@@ -1714,9 +1942,6 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
|
||||
if (!server.buffer_clear(request)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, nullptr, 0)) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_SET_TENSOR: {
|
||||
@@ -1751,9 +1976,6 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
|
||||
if (!server.init_tensor(request)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, nullptr, 0)) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_GET_TENSOR: {
|
||||
@@ -1941,10 +2163,10 @@ static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggm
|
||||
props->type = ggml_backend_rpc_device_get_type(dev);
|
||||
ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
/* .async = */ true,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
/* .events = */ true,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1980,6 +2202,24 @@ static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_b
|
||||
return buft_ctx->endpoint == dev_ctx->endpoint && buft_ctx->device == dev_ctx->device;
|
||||
}
|
||||
|
||||
// Device interface hook: create a new event on the dispatcher serving this
// device's endpoint (looked up, or created, through get_dispatcher()).
static ggml_backend_event_t ggml_backend_rpc_device_event_new(ggml_backend_dev_t dev) {
    auto * dev_ctx = static_cast<ggml_backend_rpc_device_context *>(dev->context);
    return get_dispatcher(dev_ctx->endpoint.c_str())->event_new(dev);
}
|
||||
|
||||
// Device interface hook: release `event` through the dispatcher serving this
// device's endpoint (looked up, or created, through get_dispatcher()).
static void ggml_backend_rpc_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
    auto * dev_ctx = static_cast<ggml_backend_rpc_device_context *>(dev->context);
    get_dispatcher(dev_ctx->endpoint.c_str())->event_free(event);
}
|
||||
|
||||
// Device interface hook: block until `event` has completed, using the
// dispatcher serving this device's endpoint (looked up, or created, through
// get_dispatcher()).
static void ggml_backend_rpc_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
    auto * dev_ctx = static_cast<ggml_backend_rpc_device_context *>(dev->context);
    get_dispatcher(dev_ctx->endpoint.c_str())->event_synchronize(event);
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||
/* .get_name = */ ggml_backend_rpc_device_get_name,
|
||||
/* .get_description = */ ggml_backend_rpc_device_get_description,
|
||||
@@ -1993,9 +2233,9 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
/* .event_new = */ ggml_backend_rpc_device_event_new,
|
||||
/* .event_free = */ ggml_backend_rpc_device_event_free,
|
||||
/* .event_synchronize = */ ggml_backend_rpc_device_event_synchronize,
|
||||
};
|
||||
|
||||
// backend reg interface
|
||||
@@ -2055,14 +2295,9 @@ ggml_backend_reg_t ggml_backend_rpc_reg(void) {
|
||||
}
|
||||
|
||||
static uint32_t ggml_backend_rpc_get_device_count(const char * endpoint) {
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
|
||||
return 0;
|
||||
}
|
||||
auto dispatcher = get_dispatcher(endpoint);
|
||||
rpc_msg_device_count_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_DEVICE_COUNT, nullptr, 0, &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
dispatcher->send(RPC_CMD_DEVICE_COUNT, nullptr, 0, &response, sizeof(response));
|
||||
return response.device_count;
|
||||
}
|
||||
|
||||
|
||||
@@ -4286,7 +4286,6 @@ struct ggml_backend_sycl_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -4675,8 +4674,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
|
||||
return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
|
||||
const int min_batch_size = 32;
|
||||
return get_op_batch_size(op) >= min_batch_size;
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_event_t
|
||||
@@ -4799,7 +4799,6 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
|
||||
@@ -4813,7 +4812,6 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
|
||||
prop, dpct::dev_mgr::instance().get_device(i))));
|
||||
|
||||
dev_ctx->description = prop.get_name();
|
||||
dev_ctx->op_offload_min_batch_size = min_batch_size;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_sycl_device_interface,
|
||||
|
||||
@@ -550,8 +550,6 @@ struct vk_device_struct {
|
||||
uint64_t max_memory_allocation_size;
|
||||
uint64_t max_buffer_size;
|
||||
uint64_t suballocation_block_size;
|
||||
uint64_t min_imported_host_pointer_alignment;
|
||||
bool external_memory_host {};
|
||||
bool fp16;
|
||||
bool bf16;
|
||||
bool pipeline_robustness;
|
||||
@@ -570,7 +568,6 @@ struct vk_device_struct {
|
||||
bool uma;
|
||||
bool prefer_host_memory;
|
||||
bool float_controls_rte_fp16;
|
||||
bool subgroup_basic;
|
||||
bool subgroup_arithmetic;
|
||||
bool subgroup_shuffle;
|
||||
bool subgroup_ballot;
|
||||
@@ -1505,11 +1502,6 @@ template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) {
|
||||
init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L);
|
||||
}
|
||||
|
||||
struct vk_quantize_q8_1_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
struct vk_staging_memcpy {
|
||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||
@@ -2418,8 +2410,7 @@ static std::vector<uint32_t> ggml_vk_find_memory_properties(const vk::PhysicalDe
|
||||
return indices;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list,
|
||||
void *import_ptr = nullptr) {
|
||||
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
|
||||
if (size > device->max_buffer_size) {
|
||||
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
|
||||
@@ -2448,12 +2439,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
||||
nullptr,
|
||||
};
|
||||
|
||||
vk::ExternalMemoryBufferCreateInfo external_memory_bci;
|
||||
if (import_ptr) {
|
||||
external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
|
||||
buffer_create_info.setPNext(&external_memory_bci);
|
||||
}
|
||||
|
||||
buf->buffer = device->device.createBuffer(buffer_create_info);
|
||||
|
||||
vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);
|
||||
@@ -2468,80 +2453,35 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
||||
mem_flags_info.setPNext(&mem_priority_info);
|
||||
}
|
||||
|
||||
if (import_ptr) {
|
||||
vk::MemoryHostPointerPropertiesEXT host_pointer_props;
|
||||
try {
|
||||
host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr);
|
||||
} catch (vk::SystemError& e) {
|
||||
GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
return {};
|
||||
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
|
||||
const auto & req_flags = *it;
|
||||
|
||||
const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
|
||||
|
||||
if (memory_type_indices.empty()) {
|
||||
continue;
|
||||
}
|
||||
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
|
||||
buf->memory_property_flags = req_flags;
|
||||
|
||||
uint32_t memory_type_idx;
|
||||
vk::MemoryPropertyFlags property_flags = *req_flags_list.begin();
|
||||
for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
|
||||
if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
|
||||
continue;
|
||||
}
|
||||
if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) {
|
||||
continue;
|
||||
}
|
||||
bool done = false;
|
||||
|
||||
vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
|
||||
// check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
|
||||
if ((memory_type.propertyFlags & property_flags) == property_flags) {
|
||||
property_flags = memory_type.propertyFlags;
|
||||
for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
|
||||
try {
|
||||
buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (memory_type_idx == 32) {
|
||||
GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n");
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
return {};
|
||||
}
|
||||
|
||||
buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags;
|
||||
try {
|
||||
vk::ImportMemoryHostPointerInfoEXT import_info;
|
||||
import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
|
||||
import_info.pHostPointer = import_ptr;
|
||||
import_info.setPNext(&mem_flags_info);
|
||||
buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info });
|
||||
} catch (const vk::SystemError& e) {
|
||||
}
|
||||
} else {
|
||||
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
|
||||
const auto & req_flags = *it;
|
||||
|
||||
const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
|
||||
|
||||
if (memory_type_indices.empty()) {
|
||||
continue;
|
||||
}
|
||||
buf->memory_property_flags = req_flags;
|
||||
|
||||
bool done = false;
|
||||
|
||||
for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
|
||||
try {
|
||||
buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
|
||||
done = true;
|
||||
break;
|
||||
} catch (const vk::SystemError& e) {
|
||||
// loop and retry
|
||||
// during last attempt throw the exception
|
||||
if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
throw e;
|
||||
}
|
||||
} catch (const vk::SystemError& e) {
|
||||
// loop and retry
|
||||
// during last attempt throw the exception
|
||||
if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2552,12 +2492,8 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
||||
|
||||
buf->ptr = nullptr;
|
||||
|
||||
if (import_ptr) {
|
||||
buf->ptr = import_ptr;
|
||||
} else {
|
||||
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
||||
}
|
||||
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
||||
}
|
||||
|
||||
device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
|
||||
@@ -3002,10 +2938,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
|
||||
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
|
||||
m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
|
||||
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
||||
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
}
|
||||
|
||||
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
|
||||
@@ -3346,12 +3278,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
GGML_ASSERT(device->subgroup_ballot);
|
||||
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
||||
if (device->coopmat_bf16_support) {
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3459,9 +3391,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
#endif
|
||||
|
||||
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
@@ -3503,9 +3435,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
@@ -3620,9 +3552,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
#endif
|
||||
|
||||
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
|
||||
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
@@ -3646,9 +3578,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
} else {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
||||
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
@@ -3688,11 +3620,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
m_wg_denoms = { 64, 64, 1 };
|
||||
s_wg_denoms = { 32, 32, 1 };
|
||||
|
||||
if (device->vendor_id == VK_VENDOR_ID_INTEL && device->architecture == INTEL_XE2) {
|
||||
// Xe2/Xe3 - bf16 warptile performance tuning
|
||||
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, 4, 4, 1, subgroup_size_8 };
|
||||
}
|
||||
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
}
|
||||
@@ -3846,22 +3773,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
|
||||
const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
}
|
||||
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
|
||||
}
|
||||
@@ -3949,9 +3876,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
|
||||
if (device->subgroup_clustered && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
||||
@@ -4159,9 +4086,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_GLU(name) \
|
||||
if (device->float_controls_rte_fp16) { \
|
||||
@@ -4307,8 +4234,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
|
||||
if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
|
||||
@@ -4520,8 +4447,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
} else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 &&
|
||||
getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) {
|
||||
device->memory_priority = true;
|
||||
} else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
|
||||
device->external_memory_host = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4536,7 +4461,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
vk::PhysicalDeviceVulkan12Properties vk12_props;
|
||||
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
||||
vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props;
|
||||
vk::PhysicalDeviceExternalMemoryHostPropertiesEXT external_memory_host_props;
|
||||
|
||||
props2.pNext = &props3;
|
||||
props3.pNext = &subgroup_props;
|
||||
@@ -4576,22 +4500,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props;
|
||||
}
|
||||
|
||||
if (device->external_memory_host) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&external_memory_host_props;
|
||||
last_struct = (VkBaseOutStructure *)&external_memory_host_props;
|
||||
}
|
||||
|
||||
device->physical_device.getProperties2(&props2);
|
||||
device->properties = props2.properties;
|
||||
device->vendor_id = device->properties.vendorID;
|
||||
device->driver_id = driver_props.driverID;
|
||||
|
||||
if (device->driver_id == vk::DriverId::eMoltenvk) {
|
||||
// Disable external_memory_host until https://github.com/KhronosGroup/MoltenVK/pull/2622
|
||||
// is available in the Vulkan SDK.
|
||||
device->external_memory_host = false;
|
||||
}
|
||||
|
||||
// Implementing the async backend interfaces seems broken on older Intel HW,
|
||||
// see https://github.com/ggml-org/llama.cpp/issues/17302.
|
||||
device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||
|
||||
@@ -4644,8 +4557,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
}
|
||||
device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
|
||||
|
||||
device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic);
|
||||
device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
|
||||
#ifdef __APPLE__
|
||||
@@ -4675,8 +4586,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated;
|
||||
|
||||
device->min_imported_host_pointer_alignment = external_memory_host_props.minImportedHostPointerAlignment;
|
||||
|
||||
device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations)));
|
||||
|
||||
std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();
|
||||
@@ -4808,10 +4717,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device_extensions.push_back("VK_KHR_pipeline_executable_properties");
|
||||
}
|
||||
|
||||
if (device->external_memory_host) {
|
||||
device_extensions.push_back("VK_EXT_external_memory_host");
|
||||
}
|
||||
|
||||
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
|
||||
|
||||
device->pipeline_executable_properties_support = pipeline_executable_properties_support;
|
||||
@@ -5078,23 +4983,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
switch (device->vendor_id) {
|
||||
#ifndef GGML_VULKAN_RUN_TESTS
|
||||
case VK_VENDOR_ID_AMD:
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_m[i] = true;
|
||||
device->mul_mat_s[i] = true;
|
||||
device->mul_mat_id_l[i] = false;
|
||||
device->mul_mat_id_m[i] = true;
|
||||
device->mul_mat_id_s[i] = true;
|
||||
break;
|
||||
case VK_VENDOR_ID_INTEL:
|
||||
if (!device->coopmat_support || device->architecture != INTEL_XE2) {
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_id_l[i] = false;
|
||||
} else {
|
||||
device->mul_mat_l[i] = true; // if coopmat & XE2+, allow large matmul warptile config for Intel
|
||||
device->mul_mat_id_l[i] = true;
|
||||
}
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_m[i] = true;
|
||||
device->mul_mat_s[i] = true;
|
||||
device->mul_mat_id_l[i] = false;
|
||||
device->mul_mat_id_m[i] = true;
|
||||
device->mul_mat_id_s[i] = true;
|
||||
break;
|
||||
@@ -6105,7 +5998,6 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
||||
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
|
||||
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
|
||||
GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
|
||||
GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));
|
||||
|
||||
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
|
||||
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
||||
@@ -6888,12 +6780,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
const uint64_t max_elements = std::min<uint64_t>(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits<uint32_t>::max());
|
||||
const uint32_t elements = std::min(ne, static_cast<uint32_t>(max_elements));
|
||||
|
||||
const vk_quantize_q8_1_push_constants pc = {
|
||||
ne,
|
||||
num_blocks,
|
||||
};
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 });
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 2>{ ne, num_blocks }, { elements, 1, 1 });
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
@@ -9884,9 +9771,8 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
|
||||
std::array<uint32_t, 3> elements;
|
||||
|
||||
const uint32_t d_state = src0->ne[0];
|
||||
uint32_t num_subgroups = d_state / ctx->device->subgroup_size;
|
||||
const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups);
|
||||
const int splitH = 16;
|
||||
const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, splitH);
|
||||
const uint32_t num_workgroups_y = n_seq;
|
||||
elements = { num_workgroups_x, num_workgroups_y, 1 };
|
||||
|
||||
@@ -14264,7 +14150,6 @@ struct ggml_backend_vk_device_context {
|
||||
std::string description;
|
||||
bool is_integrated_gpu;
|
||||
std::string pci_bus_id;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -14321,19 +14206,6 @@ static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
|
||||
// reject any tensors larger than the max buffer size
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op->src[i] && ggml_nbytes(op->src[i]) > device->max_buffer_size) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (ggml_nbytes(op) > device->max_buffer_size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
@@ -14382,6 +14254,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
if (op->op == GGML_OP_MUL_MAT_ID) {
|
||||
if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
|
||||
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
|
||||
@@ -14442,6 +14316,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
}
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
bool coopmat2 = device->coopmat2;
|
||||
uint32_t HSK = op->src[1]->ne[0];
|
||||
uint32_t HSV = op->src[2]->ne[0];
|
||||
@@ -14663,6 +14539,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
// pipeline_argsort_large_f32 requires vulkan memory model.
|
||||
if (device->vulkan_memory_model) {
|
||||
return true;
|
||||
@@ -14675,6 +14553,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
// We could potentially support larger, using argsort to sort the
|
||||
// whole thing. Not clear if this is needed.
|
||||
uint32_t min_pipeline = (uint32_t)log2f(float(op->ne[0])) + 1;
|
||||
@@ -14721,6 +14601,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]);
|
||||
case GGML_OP_CUMSUM:
|
||||
{
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]);
|
||||
}
|
||||
@@ -14728,6 +14610,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
}
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
{
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
|
||||
if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
@@ -14792,13 +14677,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t shmem_size = d_state * sizeof(float);
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
|
||||
if (shmem_size > device->properties.limits.maxComputeSharedMemorySize) {
|
||||
return false;
|
||||
}
|
||||
const uint32_t SPLIT_H = 16;
|
||||
|
||||
if (!device->subgroup_basic) {
|
||||
size_t stateC_size = SPLIT_H * d_state * sizeof(float);
|
||||
|
||||
if (stateC_size > device->properties.limits.maxComputeSharedMemorySize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -14838,10 +14724,12 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
|
||||
}
|
||||
|
||||
// Device-interface callback: reports whether `op` should be offloaded to this device.
// Returns true when op->ne[1] reaches the minimum batch size and the op is not
// GGML_OP_GET_ROWS, or when op->ne[2] reaches it and the op is GGML_OP_MUL_MAT_ID.
//
// NOTE(review): this text comes from a diff hunk whose +/- markers were lost, so two
// variants of the body are interleaved. The hunk header (-14838,10 +14724,12) is
// consistent with the `dev_ctx` lines being the removed side and the hard-coded
// `min_batch_size` lines plus `UNUSED(dev)` being the added side — confirm against the commit.
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
// Variant A: threshold read from the per-device context.
ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
// Variant B: fixed threshold of 32.
const int min_batch_size = 32;
|
||||
|
||||
// Variant A result.
return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
// Variant B result (same predicate with the fixed threshold); unreachable if read literally after the return above.
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
|
||||
// Placed after the return, so it only marks `dev` as intentionally unused (variant B does not read it).
UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
|
||||
@@ -14885,51 +14773,6 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
|
||||
VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
|
||||
}
|
||||
|
||||
// Tries to create a vk_buffer on top of caller-provided host memory.
//
// device: device to create the buffer on; its `external_memory_host` flag and
//         `min_imported_host_pointer_alignment` value gate the attempt.
// ptr:    start of the host memory; forwarded to ggml_vk_create_buffer.
// size:   size in bytes; forwarded to ggml_vk_create_buffer.
//
// Returns the created buffer, or an empty vk_buffer when the device flag is off,
// when `ptr` or `size` fails the alignment mask test, or when
// ggml_vk_create_buffer throws vk::SystemError (a warning is logged in that case).
//
// NOTE(review): the enclosing hunk header (-14885,51 +14773,6) suggests this whole
// function sits on the removed side of the diff — confirm against the commit.
static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
|
||||
// Bail out early unless the device was flagged as supporting host-pointer import.
if (!device->external_memory_host) {
|
||||
return {};
|
||||
}
|
||||
|
||||
uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
|
||||
// Reject addresses with any bit set below the alignment.
// The `& (alignment - 1)` test assumes the alignment is a power of two — TODO confirm.
if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
|
||||
return {};
|
||||
}
|
||||
// The size is subject to the same mask test as the address.
if (size & (device->min_imported_host_pointer_alignment - 1)) {
|
||||
return {};
|
||||
}
|
||||
|
||||
// Request host-visible, host-coherent and host-cached memory for the buffer.
const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
|
||||
|
||||
// Starts out empty so that a failed creation below still returns an empty buffer.
vk_buffer buf {};
|
||||
try {
|
||||
buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr);
|
||||
} catch (vk::SystemError& e) {
|
||||
// Best effort: log and fall through with `buf` still empty instead of propagating.
GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Device-interface callback: wraps caller-provided host memory in a backend buffer.
//
// dev:             backend device; its context supplies the Vulkan device index.
// ptr, size:       host memory range, forwarded to ggml_vk_buffer_from_host_ptr.
// max_tensor_size: not used by this implementation.
//
// Returns the buffer produced by ggml_backend_buffer_init, or an empty
// (value-initialized) ggml_backend_buffer_t when ggml_vk_buffer_from_host_ptr
// yields an empty vk_buffer.
static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
|
||||
// Resolve the vk_device for the index stored in the device context.
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
|
||||
vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size);
|
||||
|
||||
// An empty vk_buffer means the helper declined or failed; report that to the caller.
if (!buf) {
|
||||
return {};
|
||||
}
|
||||
|
||||
// Heap-allocated context takes over `buf` via std::move.
// NOTE(review): presumably freed by the buffer interface when the buffer is released — confirm.
ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name);
|
||||
|
||||
// Build the backend buffer with this device's buffer type and the Vulkan buffer interface.
ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
|
||||
/* .get_name = */ ggml_backend_vk_device_get_name,
|
||||
/* .get_description = */ ggml_backend_vk_device_get_description,
|
||||
@@ -14939,7 +14782,7 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
|
||||
/* .init_backend = */ ggml_backend_vk_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_vk_device_buffer_from_host_ptr,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_vk_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_vk_device_offload_op,
|
||||
@@ -14967,7 +14810,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
@@ -14977,7 +14819,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
ctx->op_offload_min_batch_size = min_batch_size;
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
|
||||
@@ -462,8 +462,7 @@ vec2 get_dm(uint ib, uint a_offset) {
|
||||
|
||||
#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
|
||||
// Returns the (d, m) pair of block `a_offset + ib` as a vec2 (Q4_1 / Q5_1 build only).
//
// NOTE(review): this text comes from a diff hunk whose +/- markers were lost, so two
// variants of the body are interleaved. The hunk header (-462,8 +462,7) is consistent
// with the two-line `data_a_packed32` variant being the removed side and the single
// `data_a` return being the added side — confirm against the commit.
vec2 get_dm(uint ib, uint a_offset) {
|
||||
// Variant A: read the pair through the `dm` member of the packed32 view.
const vec2 dm = vec2(data_a_packed32[a_offset + ib].dm);
|
||||
return dm;
|
||||
// Variant B: read `d` and `m` separately from data_a and convert each to float.
return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
#endif
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
@@ -63,15 +63,16 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
buf_a[buf_idx + 9] = FLOAT_TYPE_VEC2(v1.zw);
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
|
||||
const vec2 dm = vec2(data_a_packed32[ib].dm);
|
||||
const uint vui = data_a_packed32[ib].qs[iqs];
|
||||
const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * dm.x + dm.y;
|
||||
const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * dm.x + dm.y;
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const float m = float(data_a_packed16[ib].m);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
||||
const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
|
||||
const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v0.xy);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE_VEC2(v0.zw);
|
||||
@@ -79,7 +80,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
buf_a[buf_idx + 9 ] = FLOAT_TYPE_VEC2(v1.zw);
|
||||
#elif defined(DATA_A_Q5_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
@@ -96,26 +97,22 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v.yw);
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const vec2 dm = vec2(data_a_packed32[ib].dm);
|
||||
const uint uint_qh = data_a_packed32[ib].qh;
|
||||
const uvec2 qh0 = uvec2(((uint_qh >> 4*iqs) << 4) & 0x10, (uint_qh >> (4*iqs + 12)) & 0x10);
|
||||
const uvec2 qh1 = uvec2(((uint_qh >> (4*iqs + 1)) << 4) & 0x10, (uint_qh >> (4*iqs + 13)) & 0x10);
|
||||
const uvec2 qh2 = uvec2(((uint_qh >> (4*iqs + 2)) << 4) & 0x10, (uint_qh >> (4*iqs + 14)) & 0x10);
|
||||
const uvec2 qh3 = uvec2(((uint_qh >> (4*iqs + 3)) << 4) & 0x10, (uint_qh >> (4*iqs + 15)) & 0x10);
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const float m = float(data_a_packed16[ib].m);
|
||||
const uint uint_qh = data_a_packed16[ib].qh;
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
||||
|
||||
const uint vui = data_a_packed32[ib].qs[iqs];
|
||||
const vec4 v0 = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * dm.x + dm.y;
|
||||
const vec4 v1 = vec4(((vui >> 16) & 0xF) | qh2.x, ((vui >> 20) & 0xF) | qh2.y, ((vui >> 24) & 0xF) | qh3.x, ((vui >> 28) & 0xF) | qh3.y) * dm.x + dm.y;
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v0.xz);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v1.xz);
|
||||
buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v0.yw);
|
||||
buf_a[buf_idx + 9] = FLOAT_TYPE_VEC2(v1.yw);
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xz);
|
||||
buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v.yw);
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
@@ -134,21 +131,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = (idx % 64) * 2; // 0,2,4..126
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint qsi = (iqs / 64) * 16 + (iqs % 16); // 0..15
|
||||
const uint scalesi = iqs / 8; // 0..15
|
||||
const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
|
||||
const vec4 qs = vec4(unpack8((data_a_packed32[ib].qs[qsi / 2] >> qsshift) & 0x03030303));
|
||||
const uvec2 qs = uvec2(unpack8(data_a_packed16[ib].qs[qsi]));
|
||||
const uint scales = data_a[ib].scales[scalesi];
|
||||
const vec2 dm = vec2(data_a[ib].dm);
|
||||
|
||||
const vec4 v = dm.x * float(scales & 0xF) * qs - dm.y * float(scales >> 4);
|
||||
const vec2 v = dm.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - dm.y * float(scales >> 4);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy);
|
||||
#elif defined(DATA_A_Q3_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
@@ -177,8 +173,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = (idx % 64) * 2; // 0,2,4..126
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
@@ -204,16 +200,16 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
const vec4 q = vec4(unpack8((data_a_packed32[ib].qs[qsi / 4] >> (b * 4)) & 0x0F0F0F0F));
|
||||
const vec2 q = vec2(unpack8((uint(data_a_packed16[ib].qs[qsi / 2]) >> (b * 4)) & 0x0F0F).xy);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(fma(d, q.x, m), fma(d, q.y, m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(fma(d, q.z, m), fma(d, q.w, m));
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, q.x, m),
|
||||
fma(d, q.y, m));
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = (idx % 64) * 2; // 0,2,4..126
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
@@ -240,12 +236,12 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
const uint qs = (data_a_packed32[ib].qs[qsi / 4] >> (b * 4)) & 0x0F0F0F0F;
|
||||
const uint qh = ((data_a_packed32[ib].qh[qhi / 4] >> (iqs / 16)) & 0x01010101) << 4;
|
||||
const vec4 q = vec4(unpack8(qs | qh));
|
||||
const uint qs = (uint(data_a_packed16[ib].qs[qsi / 2]) >> (b * 4)) & 0x0F0F;
|
||||
const uint qh = ((uint(data_a_packed16[ib].qh[qhi / 2]) >> (iqs / 16)) & 0x0101) << 4;
|
||||
const vec2 q = vec2(unpack8(qs | qh).xy);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(fma(d, q.x, m), fma(d, q.y, m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(fma(d, q.z, m), fma(d, q.w, m));
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, q.x, m),
|
||||
fma(d, q.y, m));
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
@@ -459,7 +455,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy);
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
@@ -473,7 +469,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
kvalues_iq4nl[vui >> 12]);
|
||||
#elif defined(DATA_A_MXFP4)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = (idx & 0x07) * 2;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#if USE_SUBGROUP_ADD
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||
#endif
|
||||
@@ -10,8 +9,7 @@
|
||||
|
||||
layout(constant_id = 0) const uint D_STATE = 128;
|
||||
layout(constant_id = 1) const uint SUBGROUP_SIZE = 32;
|
||||
|
||||
const uint32_t c_factor = D_STATE / SUBGROUP_SIZE;
|
||||
layout(constant_id = 2) const uint SPLIT_H = 16;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
@@ -43,28 +41,22 @@ float softplus(float x) {
|
||||
}
|
||||
}
|
||||
|
||||
#if !USE_SUBGROUP_ADD
|
||||
shared float temp[D_STATE];
|
||||
#endif
|
||||
shared float stateC[SPLIT_H * D_STATE];
|
||||
|
||||
void main() {
|
||||
const uint subgroup = gl_SubgroupID;
|
||||
const uint lane = gl_SubgroupInvocationID;
|
||||
const uint tid = gl_SubgroupID * SUBGROUP_SIZE + lane;
|
||||
const uint subgroup_idx = gl_WorkGroupID.x * c_factor + subgroup;
|
||||
|
||||
const uint head_idx = subgroup_idx / d_head;
|
||||
const uint head_off = (subgroup_idx % d_head) * 4;
|
||||
const uint seq_idx = gl_WorkGroupID.y;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint head_idx = (gl_WorkGroupID.x * SPLIT_H) / d_head;
|
||||
const uint head_off = ((gl_WorkGroupID.x * SPLIT_H) % d_head) * 4;
|
||||
const uint seq_idx = gl_WorkGroupID.y;
|
||||
|
||||
const uint group_off = (head_idx / (n_head / n_group)) * D_STATE * 4;
|
||||
const uint s0_base_idx = (uint(ids[seq_idx]) * nb03 + head_idx * nb02 + head_off * D_STATE) / 4;
|
||||
const uint x_base_idx = (seq_idx * nb13 + subgroup_idx * 4) / 4;
|
||||
const uint x_base_idx = (seq_idx * nb13 + gl_WorkGroupID.x * SPLIT_H * 4) / 4;
|
||||
const uint dt_base_idx = (seq_idx * nb22 + head_idx * 4) / 4;
|
||||
const uint A_base_idx = (head_idx * nb31) / 4;
|
||||
const uint B_base_idx = (seq_idx * nb43 + group_off) / 4;
|
||||
const uint C_base_idx = (seq_idx * nb53 + group_off) / 4;
|
||||
const uint y_base_idx = seq_idx * n_tok * n_head * d_head + subgroup_idx;
|
||||
const uint y_base_idx = seq_idx * n_tok * n_head * d_head + gl_WorkGroupID.x * SPLIT_H;
|
||||
const uint s_base_idx = (s_off + seq_idx * nb03 + head_idx * nb02 + head_off * D_STATE) / 4;
|
||||
|
||||
const uint stride_x = nb12 / 4;
|
||||
@@ -73,52 +65,76 @@ void main() {
|
||||
const uint stride_C = nb52 / 4;
|
||||
const uint stride_y = n_head * d_head;
|
||||
|
||||
float state[c_factor];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < c_factor; j++) {
|
||||
state[j] = s0[s0_base_idx + SUBGROUP_SIZE * j + lane];
|
||||
float state[SPLIT_H];
|
||||
[[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
|
||||
state[j] = s0[s0_base_idx + j * D_STATE + tid];
|
||||
}
|
||||
|
||||
float a = A[A_base_idx];
|
||||
|
||||
for (uint i = 0; i < n_tok; i++) {
|
||||
float dt_soft_plus = softplus(dt[dt_base_idx + i * stride_dt]);
|
||||
const float dt_soft_plus = softplus(dt[dt_base_idx + i * stride_dt]);
|
||||
|
||||
float state_sum = 0.0f;
|
||||
const float dA = exp(dt_soft_plus * A[A_base_idx]);
|
||||
|
||||
const float B_val = B[B_base_idx + i * stride_B + tid];
|
||||
const float C_val = C[C_base_idx + i * stride_C + tid];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
|
||||
const float x_dt = x[x_base_idx + i * stride_x + j] * dt_soft_plus;
|
||||
|
||||
const float dA = exp(dt_soft_plus * a);
|
||||
const float x_dt = x[x_base_idx + i * stride_x] * dt_soft_plus;
|
||||
[[unroll]] for (uint j = 0; j < c_factor; j++) {
|
||||
float B_val = B[B_base_idx + i * stride_B + SUBGROUP_SIZE * j + lane];
|
||||
float C_val = C[C_base_idx + i * stride_C + SUBGROUP_SIZE * j + lane];
|
||||
state[j] = (state[j] * dA) + (B_val * x_dt);
|
||||
state_sum += state[j] * C_val;
|
||||
|
||||
stateC[j * D_STATE + tid] = state[j] * C_val;
|
||||
}
|
||||
|
||||
#if USE_SUBGROUP_ADD
|
||||
state_sum = subgroupAdd(state_sum);
|
||||
#else
|
||||
temp[tid] = state_sum;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = SUBGROUP_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (lane < s) {
|
||||
temp[tid] += temp[tid + s];
|
||||
[[unroll]]
|
||||
for (uint w = D_STATE / 2; w >= SUBGROUP_SIZE; w >>= 1) {
|
||||
[[unroll]] for (uint j = 0; j < (w * SPLIT_H + D_STATE - 1) / D_STATE; j++) {
|
||||
const uint k = (tid % w) + (D_STATE * (tid / w)) + j * D_STATE * (D_STATE / w);
|
||||
if (k < SPLIT_H * D_STATE && (k + w) < SPLIT_H * D_STATE) {
|
||||
stateC[k] += stateC[k + w];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
// get the value from lane 0
|
||||
state_sum = temp[subgroup * SUBGROUP_SIZE];
|
||||
barrier();
|
||||
|
||||
[[unroll]] for (uint j = 0; j < max(1, SPLIT_H / (D_STATE / SUBGROUP_SIZE)); j++) {
|
||||
const uint idx = (tid % SUBGROUP_SIZE) +
|
||||
D_STATE * (tid / SUBGROUP_SIZE) +
|
||||
j * D_STATE * (D_STATE / SUBGROUP_SIZE);
|
||||
const uint max_idx = SUBGROUP_SIZE - 1 +
|
||||
D_STATE * ((D_STATE - 1) / SUBGROUP_SIZE) +
|
||||
j * D_STATE * (D_STATE / SUBGROUP_SIZE);
|
||||
|
||||
if (idx < SPLIT_H * D_STATE ||
|
||||
max_idx < SPLIT_H * D_STATE) {
|
||||
float sc;
|
||||
#if USE_SUBGROUP_ADD
|
||||
sc = stateC[idx];
|
||||
sc = subgroupAdd(sc);
|
||||
#else
|
||||
[[unroll]] for (uint offset = SUBGROUP_SIZE / 2; offset > 0; offset >>= 1) {
|
||||
if (idx + offset < SPLIT_H * D_STATE) {
|
||||
stateC[idx] += stateC[idx + offset];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid % SUBGROUP_SIZE == 0) {
|
||||
sc = stateC[idx];
|
||||
}
|
||||
#endif
|
||||
|
||||
if (lane == 0) {
|
||||
d[y_base_idx + i * stride_y] = state_sum;
|
||||
if (tid % SUBGROUP_SIZE == 0) {
|
||||
const uint k = tid / SUBGROUP_SIZE + j * (D_STATE / SUBGROUP_SIZE);
|
||||
d[y_base_idx + i * stride_y + k] = sc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier();
|
||||
}
|
||||
|
||||
// write back the state
|
||||
[[unroll]]
|
||||
for (int j = 0; j < c_factor; j++) {
|
||||
d[s_base_idx + SUBGROUP_SIZE * j + lane] = state[j];
|
||||
[[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
|
||||
d[s_base_idx + j * D_STATE + tid] = state[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -552,9 +552,9 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
std::string load_vec_quant = "2";
|
||||
if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q5_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
|
||||
if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
|
||||
load_vec_quant = "8";
|
||||
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4"))
|
||||
else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4"))
|
||||
load_vec_quant = "4";
|
||||
|
||||
if (tname == "bf16") {
|
||||
|
||||
@@ -2273,16 +2273,6 @@ static void ggml_webgpu_init_unary_pipeline(webgpu_context & webgpu_ctx) {
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_xielu_inplace_f32, "xielu_inplace_f32", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_XIELU][GGML_TYPE_F16][1] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_xielu_inplace_f16, "xielu_inplace_f16", constants);
|
||||
|
||||
// CEIL
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F32][0] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_f32, "ceil_f32", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F16][0] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_f16, "ceil_f16", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F32][1] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_inplace_f32, "ceil_inplace_f32", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F16][1] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_inplace_f16, "ceil_inplace_f16", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) {
|
||||
@@ -2538,7 +2528,6 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_XIELU:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
supports_op = supports_op =
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
|
||||
@@ -16,8 +16,7 @@
|
||||
"HARDSWISH_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));",
|
||||
"GELU_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
|
||||
"GELU_QUICK_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
|
||||
"GELU_ERF_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
|
||||
"CEIL_FUNC": "{{MUTATE}}[dst_i] = ceil(src[src_i]);"
|
||||
"GELU_ERF_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458"
|
||||
}
|
||||
|
||||
#end(REPL_TEMPLATES)
|
||||
@@ -358,27 +357,6 @@
|
||||
"SHADER_NAME": "gelu_erf_inplace_f16",
|
||||
"REPLS": { "TYPE": "f16", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
|
||||
"DECLS": ["INPLACE"]
|
||||
},
|
||||
|
||||
{
|
||||
"SHADER_NAME": "ceil_f32",
|
||||
"REPLS": { "TYPE": "f32", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
|
||||
"DECLS": ["NOT_INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_NAME": "ceil_f16",
|
||||
"REPLS": { "TYPE": "f16", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
|
||||
"DECLS": ["NOT_INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_NAME": "ceil_inplace_f32",
|
||||
"REPLS": { "TYPE": "f32", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
|
||||
"DECLS": ["INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_NAME": "ceil_inplace_f16",
|
||||
"REPLS": { "TYPE": "f16", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
|
||||
"DECLS": ["INPLACE"]
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
+3
-5
@@ -53,15 +53,13 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// Needed for ggml_fp32_to_bf16_row()
|
||||
#if defined(__AVX512BF16__)
|
||||
#if defined(_MSC_VER)
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
#else
|
||||
#include <immintrin.h>
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
#endif // defined(_MSC_VER)
|
||||
#endif // defined(__AVX512BF16__)
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || \
|
||||
defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
|
||||
|
||||
@@ -104,7 +104,6 @@ class Keys:
|
||||
VOCAB_SIZE = "{arch}.vocab_size"
|
||||
CONTEXT_LENGTH = "{arch}.context_length"
|
||||
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out"
|
||||
FEATURES_LENGTH = "{arch}.features_length"
|
||||
BLOCK_COUNT = "{arch}.block_count"
|
||||
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
|
||||
@@ -3039,7 +3038,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M
|
||||
],
|
||||
MODEL_ARCH.LFM2MOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
||||
@@ -681,9 +681,6 @@ class GGUFWriter:
|
||||
# Store `length` as a uint32 under the EMBEDDING_LENGTH key, with the key
# template formatted for this writer's architecture (self.arch).
def add_embedding_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
# Store `length` as a uint32 under the EMBEDDING_LENGTH_OUT key, with the key
# template formatted for this writer's architecture (self.arch).
def add_embedding_length_out(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH_OUT.format(arch=self.arch), length)
|
||||
|
||||
# Store `length` as a uint32 under the FEATURES_LENGTH key, with the key
# template formatted for this writer's architecture (self.arch).
def add_features_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ python = ">=3.8"
|
||||
numpy = ">=1.17"
|
||||
tqdm = ">=4.27"
|
||||
pyyaml = ">=5.1"
|
||||
requests = ">=2.25"
|
||||
sentencepiece = { version = ">=0.1.98,<=0.2.0", optional = true }
|
||||
PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
|
||||
|
||||
|
||||
+1
-3
@@ -309,7 +309,6 @@ extern "C" {
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_direct_io; // use direct io, takes precedence over use_mmap
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
||||
@@ -495,7 +494,7 @@ extern "C" {
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
size_t margin, // margin of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
@@ -536,7 +535,6 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
|
||||
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
|
||||
Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
@@ -1,67 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# initialize a new worktree from a PR number:
|
||||
#
|
||||
# - creates a new remote using the fork's clone URL
|
||||
# - creates a local branch tracking the remote branch
|
||||
# - creates a new worktree in a parent folder, suffixed with "-pr-${PR}"
|
||||
#
|
||||
# sample usage:
|
||||
# ./scripts/pr2wt.sh 12345
|
||||
# ./scripts/pr2wt.sh 12345 opencode
|
||||
# ./scripts/pr2wt.sh 12345 "cmake -B build && cmake --build build"
|
||||
|
||||
# Print the expected command line (script name, PR number, optional command)
# and terminate the script with exit status 1.
function usage() {
|
||||
echo "usage: $0 <pr_number> [cmd]"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# check we are in the right directory
|
||||
if [[ ! -f "scripts/pr2wt.sh" ]]; then
|
||||
echo "error: this script must be run from the root of the repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ $# -lt 1 || $# -gt 2 ]]; then
|
||||
usage
|
||||
fi
|
||||
|
||||
PR=$1
|
||||
[[ "$PR" =~ ^[0-9]+$ ]] || { echo "error: PR number must be numeric"; exit 1; }
|
||||
|
||||
url_origin=$(git config --get remote.origin.url) || {
|
||||
echo "error: no remote named 'origin' in this repository"
|
||||
exit 1
|
||||
}
|
||||
|
||||
org_repo=$(echo $url_origin | cut -d/ -f4-)
|
||||
org_repo=${org_repo%.git}
|
||||
|
||||
echo "org/repo: $org_repo"
|
||||
|
||||
meta=$(curl -sSf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/${org_repo}/pulls/${PR}")
|
||||
|
||||
url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url')
|
||||
head_ref=$(echo "$meta" | jq -r '.head.ref')
|
||||
|
||||
echo "url: $url_remote"
|
||||
echo "head_ref: $head_ref"
|
||||
|
||||
git remote rm pr/${PR} 2> /dev/null
|
||||
git remote add pr/${PR} $url_remote
|
||||
git fetch pr/${PR} $head_ref
|
||||
|
||||
dir=$(basename $(pwd))
|
||||
|
||||
git branch -D pr/$PR 2> /dev/null
|
||||
git worktree add -b pr/$PR ../$dir-pr-$PR pr/$PR/${head_ref} 2> /dev/null
|
||||
|
||||
wt_path=$(cd ../$dir-pr-$PR && pwd)
|
||||
|
||||
echo "git worktree created in $wt_path"
|
||||
|
||||
# if a command was provided, execute it
|
||||
if [[ $# -eq 2 ]]; then
|
||||
cd ../$dir-pr-$PR
|
||||
eval "$2"
|
||||
fi
|
||||
@@ -16,14 +16,8 @@ model="Llama-3.2-3B-Instruct-Q4_0.gguf"
|
||||
device="HTP0"
|
||||
[ "$D" != "" ] && device="$D"
|
||||
|
||||
verbose=
|
||||
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
|
||||
|
||||
experimental=
|
||||
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
|
||||
verbose=""
|
||||
[ "$V" != "" ] && verbose="$V"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
@@ -40,7 +34,7 @@ adb $adbserial shell " \
|
||||
cd $basedir; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$ndev $nhvx $opmask $verbose $experimental $profile ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
|
||||
$ndev $nhvx $opmask ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--batch-size 128 -ngl 99 $cli_opts $@ \
|
||||
--batch-size 128 -ngl 99 $@ \
|
||||
"
|
||||
|
||||
@@ -16,7 +16,7 @@ vendor = {
|
||||
# "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h",
|
||||
"https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
|
||||
|
||||
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.0/httplib.h": "vendor/cpp-httplib/httplib.h",
|
||||
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.28.0/httplib.h": "vendor/cpp-httplib/httplib.h",
|
||||
|
||||
"https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
|
||||
}
|
||||
|
||||
@@ -152,7 +152,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
|
||||
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
|
||||
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
||||
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
||||
@@ -2076,7 +2075,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM_LFM2,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_DENSE_2_OUT,
|
||||
};
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
return {
|
||||
|
||||
@@ -156,7 +156,6 @@ enum llm_kv {
|
||||
LLM_KV_VOCAB_SIZE,
|
||||
LLM_KV_CONTEXT_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH_OUT,
|
||||
LLM_KV_FEATURES_LENGTH,
|
||||
LLM_KV_BLOCK_COUNT,
|
||||
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
||||
|
||||
+10
-13
@@ -758,8 +758,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
const uint32_t n_embd_out = model.hparams.get_n_embd_out();
|
||||
return embd + j*n_embd_out;
|
||||
return embd + j*model.hparams.n_embd;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
@@ -1195,10 +1194,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
{
|
||||
// extract token embeddings
|
||||
GGML_ASSERT(embd != nullptr);
|
||||
const uint32_t n_embd_out = hparams.get_n_embd_out();
|
||||
|
||||
GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
|
||||
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_MEAN:
|
||||
case LLAMA_POOLING_TYPE_CLS:
|
||||
@@ -1602,13 +1600,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
{
|
||||
// extract token embeddings
|
||||
GGML_ASSERT(embd != nullptr);
|
||||
const uint32_t n_embd_out = hparams.get_n_embd_out();
|
||||
float * embd_out = embd + n_outputs_prev*n_embd_out;
|
||||
float * embd_out = embd + n_outputs_prev*n_embd;
|
||||
|
||||
if (n_outputs) {
|
||||
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
|
||||
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
|
||||
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
|
||||
}
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_MEAN:
|
||||
@@ -1733,9 +1730,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
|
||||
|
||||
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
|
||||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd_out = hparams.get_n_embd_out();
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
bool has_logits = true;
|
||||
bool has_embd = cparams.embeddings;
|
||||
@@ -1776,7 +1773,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
|
||||
|
||||
// Allocate CPU logits buffer only if needed by sequences in this batch
|
||||
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
|
||||
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
||||
|
||||
// TODO: avoid this branching by working with the worst-case
|
||||
if (!has_sampling) {
|
||||
|
||||
+3
-7
@@ -2071,18 +2071,14 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
||||
void llm_graph_context::build_dense_out(
|
||||
ggml_tensor * dense_2,
|
||||
ggml_tensor * dense_3) const {
|
||||
if (!cparams.embeddings || !(dense_2 || dense_3)) {
|
||||
if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
|
||||
return;
|
||||
}
|
||||
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
|
||||
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
|
||||
|
||||
if (dense_2) {
|
||||
cur = ggml_mul_mat(ctx0, dense_2, cur);
|
||||
}
|
||||
if (dense_3) {
|
||||
cur = ggml_mul_mat(ctx0, dense_3, cur);
|
||||
}
|
||||
cur = ggml_mul_mat(ctx0, dense_2, cur);
|
||||
cur = ggml_mul_mat(ctx0, dense_3, cur);
|
||||
cb(cur, "result_embd_pooled", -1);
|
||||
res->t_embd_pooled = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
|
||||
return n_embd_inp;
|
||||
}
|
||||
|
||||
// Returns n_embd_out when it is greater than zero; otherwise falls back to n_embd.
uint32_t llama_hparams::get_n_embd_out() const {
|
||||
return n_embd_out > 0 ? n_embd_out : n_embd;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
|
||||
@@ -162,9 +162,6 @@ struct llama_hparams {
|
||||
// for Classifiers
|
||||
uint32_t n_cls_out = 1;
|
||||
|
||||
// output embedding dimension (0 = use n_embd)
|
||||
uint32_t n_embd_out = 0;
|
||||
|
||||
// llama4 smallthinker
|
||||
uint32_t n_moe_layer_step = 0;
|
||||
uint32_t n_no_rope_layer_step = 4;
|
||||
@@ -237,9 +234,6 @@ struct llama_hparams {
|
||||
// dimension of main + auxiliary input embeddings
|
||||
uint32_t n_embd_inp() const;
|
||||
|
||||
// dimension of output embeddings
|
||||
uint32_t get_n_embd_out() const;
|
||||
|
||||
// dimension of key embeddings across all k-v heads
|
||||
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
|
||||
|
||||
|
||||
+39
-72
@@ -110,7 +110,7 @@ struct llama_file::impl {
|
||||
}
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) {
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
size_t bytes_read = 0;
|
||||
while (bytes_read < len) {
|
||||
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
|
||||
@@ -127,7 +127,7 @@ struct llama_file::impl {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t read_u32() {
|
||||
uint32_t read_u32() const {
|
||||
uint32_t val;
|
||||
read_raw(&val, sizeof(val));
|
||||
return val;
|
||||
@@ -154,8 +154,8 @@ struct llama_file::impl {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
bool has_direct_io() const {
|
||||
return true;
|
||||
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
|
||||
throw std::runtime_error("DirectIO is not implemented on Windows.");
|
||||
}
|
||||
|
||||
~impl() {
|
||||
@@ -164,45 +164,33 @@ struct llama_file::impl {
|
||||
}
|
||||
}
|
||||
#else
|
||||
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
|
||||
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
|
||||
#ifdef __linux__
|
||||
// Try unbuffered I/O for read only
|
||||
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
|
||||
if (init_fd()) {
|
||||
fd = open(fname, O_RDONLY | O_DIRECT);
|
||||
|
||||
if (fd != -1) {
|
||||
struct stat file_stats{};
|
||||
fstat(fd, &file_stats);
|
||||
|
||||
size = file_stats.st_size;
|
||||
alignment = file_stats.st_blksize;
|
||||
|
||||
off_t ret = lseek(fd, 0, SEEK_SET);
|
||||
if (ret == -1) {
|
||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
||||
}
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
|
||||
fname, strerror(errno));
|
||||
|
||||
LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
|
||||
fname, strerror(errno));
|
||||
}
|
||||
#endif
|
||||
init_fp(mode);
|
||||
}
|
||||
|
||||
#ifdef __linux__
|
||||
// Tries to open `fname` read-only with O_DIRECT.
//
// On success: stores the descriptor in `fd`, records the file size in `size`
// and the reported block size (st_blksize) in `alignment`, rewinds to offset 0
// and returns true. Throws std::runtime_error if the rewind (lseek) fails.
// Returns false when open() fails, leaving `fd` at -1.
//
// NOTE(review): the return value of fstat() is not checked here — confirm
// whether a failure can occur on a successfully opened descriptor.
bool init_fd() {
|
||||
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
|
||||
|
||||
if (fd != -1) {
|
||||
struct stat file_stats{};
|
||||
fstat(fd, &file_stats);
|
||||
|
||||
size = file_stats.st_size;
|
||||
alignment = file_stats.st_blksize;
|
||||
|
||||
off_t ret = lseek(fd, 0, SEEK_SET);
|
||||
if (ret == -1) {
|
||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
void init_fp(const char * mode) {
|
||||
fp = ggml_fopen(fname.c_str(), mode);
|
||||
fp = ggml_fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
@@ -238,7 +226,7 @@ struct llama_file::impl {
|
||||
}
|
||||
}
|
||||
|
||||
void read_raw_unsafe(void * ptr, size_t len) {
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -261,17 +249,6 @@ struct llama_file::impl {
|
||||
if (errno == EINTR) {
|
||||
continue; // Interrupted by signal, retry
|
||||
}
|
||||
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
||||
if (errno == EFAULT) {
|
||||
auto curr_off = tell();
|
||||
close(fd);
|
||||
fd = -1;
|
||||
alignment = 1;
|
||||
init_fp("rb");
|
||||
seek(curr_off, SEEK_SET);
|
||||
read_raw_unsafe(ptr, len);
|
||||
return;
|
||||
}
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret == 0) {
|
||||
@@ -289,8 +266,7 @@ struct llama_file::impl {
|
||||
}
|
||||
}
|
||||
|
||||
void read_aligned_chunk(void * dest, size_t size) {
|
||||
size_t offset = tell();
|
||||
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
|
||||
off_t aligned_offset = offset & ~(alignment - 1);
|
||||
off_t offset_from_alignment = offset - aligned_offset;
|
||||
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
||||
@@ -307,21 +283,13 @@ struct llama_file::impl {
|
||||
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
||||
|
||||
seek(aligned_offset, SEEK_SET);
|
||||
read_raw_unsafe(buffer.get(), bytes_to_read);
|
||||
read_raw(buffer.get(), bytes_to_read);
|
||||
|
||||
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
||||
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
||||
}
|
||||
|
||||
// Reads `len` bytes into `ptr`, choosing the read path by I/O mode:
// read_aligned_chunk() when has_direct_io() is true, read_raw_unsafe() otherwise.
void read_raw(void * ptr, size_t len) {
|
||||
if (has_direct_io()) {
|
||||
read_aligned_chunk(ptr, len);
|
||||
} else {
|
||||
read_raw_unsafe(ptr, len);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t read_u32() {
|
||||
uint32_t read_u32() const {
|
||||
uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
@@ -342,10 +310,6 @@ struct llama_file::impl {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
// True when a raw file descriptor is open (fd != -1) and the recorded
// alignment is greater than 1.
bool has_direct_io() const {
|
||||
return fd != -1 && alignment > 1;
|
||||
}
|
||||
|
||||
~impl() {
|
||||
if (fd != -1) {
|
||||
close(fd);
|
||||
@@ -354,9 +318,17 @@ struct llama_file::impl {
|
||||
}
|
||||
}
|
||||
int fd = -1;
|
||||
std::string fname;
|
||||
#endif
|
||||
|
||||
// Reads `len` bytes starting at file position `offset` into `ptr`.
//
// When `alignment` is not 1 the request is delegated to read_aligned_chunk();
// otherwise the file is positioned with seek() and read with read_raw().
void read_raw_at(void * ptr, size_t len, size_t offset) const {
|
||||
if (alignment != 1) {
|
||||
read_aligned_chunk(offset, ptr, len);
|
||||
} else {
|
||||
seek(offset, SEEK_SET);
|
||||
read_raw(ptr, len);
|
||||
}
|
||||
}
|
||||
|
||||
// Accessor for the `alignment` member.
size_t read_alignment() const {
|
||||
return alignment;
|
||||
}
|
||||
@@ -375,7 +347,6 @@ size_t llama_file::tell() const { return pimpl->tell(); }
|
||||
size_t llama_file::size() const { return pimpl->size; }
|
||||
|
||||
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
|
||||
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
||||
|
||||
int llama_file::file_id() const {
|
||||
#ifdef _WIN32
|
||||
@@ -390,14 +361,10 @@ int llama_file::file_id() const {
|
||||
}
|
||||
|
||||
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
||||
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
||||
#ifdef _WIN32
|
||||
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
||||
#else
|
||||
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
|
||||
#endif
|
||||
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
|
||||
void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
|
||||
|
||||
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
|
||||
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
|
||||
|
||||
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
|
||||
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
|
||||
|
||||
+4
-5
@@ -24,16 +24,15 @@ struct llama_file {
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
void read_raw(void * ptr, size_t len);
|
||||
void read_raw_unsafe(void * ptr, size_t len);
|
||||
void read_aligned_chunk(void * dest, size_t size);
|
||||
uint32_t read_u32();
|
||||
void read_raw(void * ptr, size_t len) const;
|
||||
void read_raw_at(void * ptr, size_t len, size_t offset) const;
|
||||
void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
|
||||
uint32_t read_u32() const;
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const;
|
||||
void write_u32(uint32_t val) const;
|
||||
|
||||
size_t read_alignment() const;
|
||||
bool has_direct_io() const;
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
|
||||
@@ -495,7 +495,6 @@ llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
@@ -528,17 +527,9 @@ llama_model_loader::llama_model_loader(
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
||||
|
||||
// Disable mmap in case Direct I/O is enabled and available
|
||||
if (use_direct_io && use_mmap) {
|
||||
use_mmap = false;
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
}
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
@@ -604,7 +595,7 @@ llama_model_loader::llama_model_loader(
|
||||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
||||
files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
@@ -748,7 +739,6 @@ llama_model_loader::llama_model_loader(
|
||||
}
|
||||
|
||||
this->use_mmap = use_mmap;
|
||||
this->use_direct_io = use_direct_io;
|
||||
this->check_tensors = check_tensors;
|
||||
this->no_alloc = no_alloc;
|
||||
}
|
||||
@@ -1110,8 +1100,7 @@ bool llama_model_loader::load_all_data(
|
||||
const auto & file = files.at(weight->idx);
|
||||
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
file->seek(weight->offs, SEEK_SET);
|
||||
file->read_raw(cur->data, n_size);
|
||||
file->read_raw_at(cur->data, n_size, weight->offs);
|
||||
if (check_tensors) {
|
||||
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
||||
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
||||
@@ -1143,7 +1132,7 @@ bool llama_model_loader::load_all_data(
|
||||
ggml_backend_event_synchronize(events[buffer_idx]);
|
||||
|
||||
// Read aligned chunk from file
|
||||
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
||||
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
||||
|
||||
// Calculate actual data portion (excluding alignment padding)
|
||||
uintptr_t ptr_data = ptr_dest_aligned;
|
||||
@@ -1173,8 +1162,7 @@ bool llama_model_loader::load_all_data(
|
||||
}
|
||||
} else {
|
||||
read_buf.resize(n_size);
|
||||
file->seek(weight->offs, SEEK_SET);
|
||||
file->read_raw(read_buf.data(), n_size);
|
||||
file->read_raw_at(read_buf.data(), n_size, weight->offs);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
||||
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
||||
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
||||
|
||||
@@ -70,7 +70,6 @@ struct llama_model_loader {
|
||||
size_t n_bytes = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
bool use_direct_io = false;
|
||||
bool check_tensors;
|
||||
bool no_alloc;
|
||||
|
||||
@@ -98,7 +97,6 @@ struct llama_model_loader {
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
|
||||
@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
|
||||
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
|
||||
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
if (hparams.n_embd_out > 0) {
|
||||
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
|
||||
}
|
||||
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
||||
|
||||
+1
-11
@@ -507,7 +507,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
@@ -2440,8 +2439,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
const bool use_mmap_buffer = true;
|
||||
|
||||
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
|
||||
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
|
||||
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
|
||||
|
||||
// build a list of buffer types for the CPU and GPU devices
|
||||
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
|
||||
@@ -6471,9 +6469,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// for LFM2-ColBert-350M
|
||||
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
@@ -7974,7 +7969,6 @@ llama_model_params llama_model_default_params() {
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
/*.use_direct_io =*/ true,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.check_tensors =*/ false,
|
||||
/*.use_extra_bufts =*/ true,
|
||||
@@ -8009,10 +8003,6 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
|
||||
return model->hparams.n_embd_inp();
|
||||
}
|
||||
|
||||
int32_t llama_model_n_embd_out(const llama_model * model) {
|
||||
return model->hparams.get_n_embd_out();
|
||||
}
|
||||
|
||||
int32_t llama_model_n_layer(const llama_model * model) {
|
||||
return model->hparams.n_layer;
|
||||
}
|
||||
|
||||
+1
-1
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
}
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
||||
+55
-87
@@ -147,8 +147,9 @@ class llama_params_fit_exception : public std::runtime_error {
|
||||
static void llama_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
@@ -167,12 +168,6 @@ static void llama_params_fit_impl(
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
@@ -192,10 +187,9 @@ static void llama_params_fit_impl(
|
||||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t min_projected_free = INT64_MAX;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
@@ -205,63 +199,45 @@ static void llama_params_fit_impl(
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
min_projected_free = std::min(min_projected_free, projected_free);
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
|
||||
projected_free >= 0 ? "surplus" : "deficit");
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
if (min_projected_free >= margin) {
|
||||
if (nd == 1) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
|
||||
if (global_surplus < 0) {
|
||||
if (nd == 1) {
|
||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
LLAMA_LOG_INFO(nd == 1 ?
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, margin/MiB, -global_surplus/MiB);
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
int64_t sum_used_target = sum_free - nd*margin_s;
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
@@ -383,11 +359,6 @@ static void llama_params_fit_impl(
|
||||
|
||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
||||
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
||||
|
||||
uint32_t n_full() const {
|
||||
assert(n_layer >= n_part);
|
||||
return n_layer - n_part;
|
||||
}
|
||||
};
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
@@ -411,7 +382,7 @@ static void llama_params_fit_impl(
|
||||
|
||||
size_t itbo = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
il0 += ngl_per_device[id].n_full();
|
||||
il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
|
||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
||||
if (itbo + 1 >= ntbo) {
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
@@ -422,7 +393,7 @@ static void llama_params_fit_impl(
|
||||
+ std::to_string(ntbo) + " is insufficient for model");
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
||||
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
|
||||
itbo++;
|
||||
}
|
||||
il0 += ngl_per_device[id].n_part;
|
||||
@@ -472,9 +443,9 @@ static void llama_params_fit_impl(
|
||||
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
||||
for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
|
||||
global_surplus_cpu_moe += dmd.free;
|
||||
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
@@ -493,18 +464,24 @@ static void llama_params_fit_impl(
|
||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
targets.push_back(dmds_full[id].free - margin);
|
||||
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
|
||||
overflow_bufts.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
for (size_t id = 0; id < nd - 1; ++id) {
|
||||
overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
|
||||
}
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
|
||||
std::vector<ngl_t> ngl_per_device(nd);
|
||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
||||
if (hp_nex > 0) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
|
||||
}
|
||||
}
|
||||
|
||||
// optimize the number of layers per device using the method of false position:
|
||||
// - ngl_per_device has 0 layers for each device, lower bound
|
||||
@@ -535,6 +512,9 @@ static void llama_params_fit_impl(
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
if (hp_nex > 0 && size_t(id) == nd - 1) {
|
||||
delta--;
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
@@ -544,8 +524,7 @@ static void llama_params_fit_impl(
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
ngl_per_device_test[id].n_layer += step_size;
|
||||
if (hp_nex) {
|
||||
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
||||
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
||||
ngl_per_device_test[id].n_part += step_size;
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
@@ -594,7 +573,7 @@ static void llama_params_fit_impl(
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
||||
for (size_t id = 0; id <= id_dense_start; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
||||
@@ -606,8 +585,12 @@ static void llama_params_fit_impl(
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
|
||||
assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
|
||||
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
|
||||
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
|
||||
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
|
||||
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
@@ -623,7 +606,7 @@ static void llama_params_fit_impl(
|
||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
||||
n_converted_test += n_convert_jd;
|
||||
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
||||
if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -642,8 +625,8 @@ static void llama_params_fit_impl(
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
|
||||
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
@@ -661,19 +644,14 @@ static void llama_params_fit_impl(
|
||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
||||
ngl_per_device_test[id].n_layer++;
|
||||
ngl_per_device_test[id].n_part++;
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
||||
if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
|
||||
id_dense_start_test++;
|
||||
}
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
||||
if (id < nd - 1) {
|
||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
@@ -681,10 +659,9 @@ static void llama_params_fit_impl(
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
@@ -693,10 +670,9 @@ static void llama_params_fit_impl(
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
@@ -711,25 +687,17 @@ static void llama_params_fit_impl(
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
}
|
||||
|
||||
enum llama_params_fit_status llama_params_fit(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
|
||||
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const llama_params_fit_exception & e) {
|
||||
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
@@ -818,7 +786,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
||||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -128,15 +127,6 @@ int main(void) {
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||
assert(params.speculative.n_max == 123);
|
||||
|
||||
// multi-value args (CSV)
|
||||
argv = {"binary_name", "--lora", "file1.gguf,\"file2,2.gguf\",\"file3\"\"3\"\".gguf\",file4\".gguf"};
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.lora_adapters.size() == 4);
|
||||
assert(params.lora_adapters[0].path == "file1.gguf");
|
||||
assert(params.lora_adapters[1].path == "file2,2.gguf");
|
||||
assert(params.lora_adapters[2].path == "file3\"3\".gguf");
|
||||
assert(params.lora_adapters[3].path == "file4\".gguf");
|
||||
|
||||
// skip this part on windows, because setenv is not supported
|
||||
#ifdef _WIN32
|
||||
printf("test-arg-parser: skip on windows build\n");
|
||||
|
||||
@@ -25,6 +25,7 @@ else()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(run)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(tts)
|
||||
add_subdirectory(mtmd)
|
||||
|
||||
@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user