Compare commits

...

19 Commits

Author SHA1 Message Date
Ruben Ortlam 1051ecd289 vulkan: Disable large coopmat matmul configuration on proprietary AMD driver (#18763)
* vulkan: Disable large coopmat matmul configuration on proprietary AMD driver

* Also disable the large tile size
2026-01-12 07:29:35 +01:00
Xuan-Son Nguyen 0c3b7a9efe model: fix qwen3next broken due to #18683 (#18762) 2026-01-11 21:00:10 +01:00
Ruben Ortlam 0e76501e1d Vulkan: Optimize Matmul parameters for AMD GPUs with Coopmat support (#18749)
* vulkan: Enable and optimize large matmul parameter combination for AMD

* limit tuning to AMD GPUs with coopmat support

* use tx_m values instead of _l
2026-01-11 17:33:33 +01:00
Xuan-Son Nguyen 4b060bf240 security: make it clear about subtopics in server (#18754)
* security: make it clear about subtopics in server

* exclude DoS
2026-01-11 16:51:03 +01:00
Daniel Bevenius 9789e28459 debug : include LLAMA_POOLING_TYPE_UNSPECIFIED in pooling check (#18692)
* debug : include LLAMA_POOLING_TYPE_UNSPECIFIED in pooling check

This commit updates the pooling check in the debug example to
also include LLAMA_POOLING_TYPE_UNSPECIFIED and not just
LLAMA_POOLING_TYPE_NONE.

* debug : normalize both pooled and token embeddings

This commit updates debug.cpp to normalize embeddings for both pooled
and non-pooled outputs. For pooled embeddings, normalization is applied
to the single vector, and for non-pooled embeddings, normalization is
applied to each token embedding vector individually.

The motivation for this is to enable non-pooled embeddings to be
normalized which was not possible previously.
2026-01-11 16:34:41 +01:00
Georgi Gerganov 84ae04f163 tests : refactor test-backend-sampler (#18753)
* tests : use "auto", use std::string

* tests : refactor test-backend-sampler.cpp

* cmake : remove redundant declarations

* ci : use smaller model

* tests : add struct test_params

* tests : reduce logit bias 100.0f -> 10.0f
2026-01-11 17:31:03 +02:00
Xuan-Son Nguyen 506bb6e010 model: try to improve Qwen3 Next (#18683)
* qwen3next: simplify qkvz projection

* use ggml_swiglu_split

* revert swiglu_split, but remove redundant repeat()

* fix missing reshape

* rm 2 redundant transposes

* move mul_mat(k,q) to outside of chunking

* rm redundant cont

* improve g_cs_chunk

* add comments about no cont

* use std::pair instead of ggml_concat

* vectorize key_gdiff calculation

* rm unused tensor

* avoid ggml_concat inside loop

* bring back ggml_concat as it may not work on other backend

* nits
2026-01-11 12:53:33 +01:00
thom-dev-fr 79456a690a readme : update UIs (#18751) 2026-01-11 13:46:50 +02:00
Xuan-Son Nguyen 28068af789 security: narrow down the scope of what we consider a vulnerability (#18752)
* security: narrow down the scope of what we consider a vulnerability

* fix typo
2026-01-11 12:23:36 +01:00
shaofeiqi 707cbafcaa opencl: add SOFTPLUS op support (#18726) 2026-01-10 21:57:44 -08:00
Aman Gupta b137718878 test-backend-ops: fix mxfp4 tests on blackwell (#18736) 2026-01-11 01:12:57 +08:00
Johannes Gäßler d2ff4e23ac HIP: adjust RDNA3.5 MMQ kernel selction logic (#18666) 2026-01-10 17:19:01 +01:00
Perry Naseck 657a2e644b cmake : update blas logic (#18205) 2026-01-10 18:00:54 +02:00
Georgi Gerganov f307926482 server : adjust unified KV cache tests (#18716) 2026-01-10 17:51:56 +02:00
Sigbjørn Skjæret 7fdc8c893d scripts : follow api redirects in pr2wt.sh (#18739) 2026-01-10 16:04:05 +01:00
Xuan-Son Nguyen 23f82f2420 preset: allow named remote preset (#18728)
* preset: allow named remote preset

* nits: fix docs

* cont docs
2026-01-10 15:12:29 +01:00
Aaron Teo 2656c0d265 docs(ggml): update backend ops (#18734)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2026-01-10 18:48:17 +08:00
Michael Wand 600a366478 Corrected: changed s13 = src1->nb[3] instead of nb[2] (#18724) 2026-01-10 10:16:07 +01:00
Adrien Gallouët ea23c15990 common : add --license to display embedded licenses (#18696)
This commit introduces a mechanism to embed all licenses directly
into the compiled binaries.

This eliminates the need to distribute separate LICENSE files alongside
the executable, making the binaries self-contained and simplifying
deployment.
2026-01-10 09:46:24 +01:00
37 changed files with 11362 additions and 4392 deletions
+16
View File
@@ -182,6 +182,9 @@ if (NOT MSVC)
endif()
endif()
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")
#
# 3rd-party
#
@@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
add_subdirectory(tools)
endif()
# Automatically add all files from the 'licenses' directory
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
foreach(FILE_PATH ${EXTRA_LICENSES})
get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
license_add_file("${NAME}" "${FILE_PATH}")
endforeach()
if (LLAMA_BUILD_COMMON)
license_generate(common)
endif()
#
# install
#
+1
View File
@@ -200,6 +200,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+41 -17
View File
@@ -1,12 +1,52 @@
# Security Policy
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
- [**Requirements**](#requirements)
- [**Covered Topics**](#covered-topics)
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Reporting a vulnerability
If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
## Requirements
Before submitting your report, ensure you meet the following requirements:
- You have read this policy and fully understand it.
- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
Maintainers reserve the right to close the report if these requirements are not fulfilled.
## Covered Topics
Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
- `src/**/*`
- `ggml/**/*`
- `gguf-py/**/*`
- `tools/server/*`, **excluding** the following topics:
- Web UI
- Features marked as experimental
- Features not recommended for use in untrusted environments (e.g., router, MCP)
- Bugs that can lead to Denial-of-Service attack
Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
## Using llama.cpp securely
@@ -55,19 +95,3 @@ If you intend to run multiple models in parallel with shared memory, it is your
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version -->
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
+2 -1
View File
@@ -297,7 +297,8 @@ function gg_sum_test_scripts {
}
function gg_get_model {
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
#local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
else
+40
View File
@@ -0,0 +1,40 @@
define_property(GLOBAL PROPERTY LICENSE_TEXT
BRIEF_DOCS "Embedded licenses"
FULL_DOCS "Global string containing all aggregated licenses"
)
function(license_add_file NAME FILE)
if(NOT IS_ABSOLUTE "${FILE}")
set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
endif()
if(EXISTS "${FILE}")
set(TITLE "License for ${NAME}")
string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
file(READ "${FILE}" TEXT)
get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
else()
message(WARNING "License file '${FILE}' not found")
endif()
endfunction()
function(license_generate TARGET_NAME)
message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
set(CPP_CONTENT "// Generated by CMake\n\n")
string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
string(APPEND CPP_CONTENT "${TEXT}")
string(APPEND CPP_CONTENT "nullptr\n")
string(APPEND CPP_CONTENT "};\n")
set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
if(TARGET ${TARGET_NAME})
target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
else()
message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
endif()
endfunction()
-24
View File
@@ -155,27 +155,3 @@ if (LLAMA_LLGUIDANCE)
endif ()
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
#
# copy the license files
#
# Check if running in GitHub Actions
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
add_custom_command(
POST_BUILD
TARGET ${TARGET}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${LICENSE_FILE}"
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
endforeach()
endif()
+28 -7
View File
@@ -2,10 +2,10 @@
#include "chat.h"
#include "common.h"
#include "download.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "download.h"
#include "preset.h"
// fix problem with std::min and std::max
@@ -48,6 +48,8 @@
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
extern const char * LICENSES[];
using json = nlohmann::ordered_json;
using namespace common_arg_utils;
@@ -279,12 +281,20 @@ static std::string clean_file_name(const std::string & fname) {
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
GGML_ASSERT(!params.model.hf_repo.empty());
// the returned hf_repo is without tag
auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
// "latest" tag (default if not specified) is translated to "default" preset
if (hf_tag == "latest") {
hf_tag = "default";
}
const bool offline = params.offline;
std::string model_endpoint = get_model_endpoint();
auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
// prepare local path for caching
auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
auto preset_path = fs_get_cache_file(preset_fname);
const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
const bool has_preset = status >= 200 && status < 400;
@@ -293,14 +303,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
if (has_preset) {
LOG_INF("applying remote preset from %s\n", preset_url.c_str());
common_preset_context ctx(ex, /* only_remote_allowed */ true);
common_preset global; // unused for now
common_preset global;
auto remote_presets = ctx.load_from_ini(preset_path, global);
if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
remote_presets = ctx.cascade(global, remote_presets);
if (remote_presets.find(hf_tag) != remote_presets.end()) {
common_preset preset = remote_presets.at(hf_tag);
LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
preset.apply_to_params(params);
} else {
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
}
} else {
LOG_INF("%s", "no remote preset found, skipping\n");
@@ -1030,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
exit(0);
}
));
add_opt(common_arg(
{"--license"},
"show source code license and dependencies",
[](common_params &) {
for (int i = 0; LICENSES[i]; ++i) {
printf("%s\n", LICENSES[i]);
}
exit(0);
}
));
add_opt(common_arg(
{"-cl", "--cache-list"},
"show list of models in cache",
+12 -6
View File
@@ -161,6 +161,16 @@ static bool is_http_status_ok(int status) {
return status >= 200 && status < 400;
}
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
return {hf_repo, tag};
}
#ifdef LLAMA_USE_CURL
//
@@ -922,12 +932,8 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & custom_headers) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// the returned hf_repo is without tag
auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+6
View File
@@ -17,6 +17,12 @@ struct common_remote_params {
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
// split HF repo with tag into <repo, tag>
// for example: "user/model:tag" -> <"user/model", "tag">
// if tag is not present, default to "latest"
// example: "user/model" -> <"user/model", "latest">
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);
struct common_cached_model_info {
std::string manifest_path;
std::string user;
+12 -2
View File
@@ -32,8 +32,10 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
"batch-size",
"ubatch-size",
"cache-reuse",
"chat-template-kwargs",
"mmap",
// note: sampling params are automatically allowed by default
// negated args will be added automatically
// negated args will be added automatically if the positive arg is specified above
};
std::set<std::string> allowed_keys;
@@ -318,6 +320,11 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
}
LOG_DBG("loading preset: %s\n", preset.name.c_str());
for (const auto & [key, value] : section.second) {
if (key == "version") {
// skip version key (reserved for future use)
continue;
}
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
throw std::runtime_error(string_format(
@@ -334,7 +341,10 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
}
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
} else {
// TODO: maybe warn about unknown key?
throw std::runtime_error(string_format(
"option '%s' not recognized in preset '%s'",
key.c_str(), preset.name.c_str()
));
}
}
+31 -1
View File
@@ -4367,7 +4367,37 @@ class Qwen3NextModel(Qwen2MoeModel):
elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
data_torch = data_torch + 1
yield from super().modify_tensors(data_torch, name, bid)
if "in_proj_qkvz.weight" in name:
# original order: [q, k, v, z] * head_count
# corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
head_k_dim = self.hparams["linear_key_head_dim"]
head_v_dim = self.hparams["linear_value_head_dim"]
num_v_heads = self.hparams["linear_num_value_heads"]
num_k_heads = self.hparams["linear_num_key_heads"]
hidden_size = self.hparams["hidden_size"]
split_arg_list_qkvz = [
head_k_dim, # q partition
head_k_dim, # k partition
(num_v_heads // num_k_heads * head_v_dim), # v partition
(num_v_heads // num_k_heads * head_v_dim), # z partition
]
# view as (n_embd, head_count, [q+k+v+z])
data_torch = data_torch.permute(1, 0).contiguous()
data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
# split into q, k, v, z
q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
# flatten dim + head_count
q = q.contiguous().view(hidden_size, -1)
k = k.contiguous().view(hidden_size, -1)
v = v.contiguous().view(hidden_size, -1)
z = z.contiguous().view(hidden_size, -1)
# stack back
qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
z = z.permute(1, 0).contiguous()
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv)
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
else:
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("RND1")
+1 -4
View File
@@ -57,7 +57,6 @@ Legend:
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -71,10 +70,9 @@ Legend:
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -99,7 +97,6 @@ Legend:
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
+426 -336
View File
@@ -965,6 +965,7 @@
"BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
"BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
"BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
@@ -4964,6 +4965,7 @@
"BLAS","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","0","no","BLAS"
"BLAS","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","BLAS"
"BLAS","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","BLAS"
"BLAS","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","BLAS"
"BLAS","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","BLAS"
"BLAS","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","BLAS"
"BLAS","ARGMAX","type=f32,ne=[32,1,1,1]","support","0","no","BLAS"
@@ -5715,15 +5717,15 @@
"BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
"BLAS","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","0","no","BLAS"
"BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
@@ -5733,6 +5735,15 @@
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
"BLAS","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
"BLAS","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
@@ -6592,6 +6603,30 @@
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q4_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q5_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q5_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q8_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=mxfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q2_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q3_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q4_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q5_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q6_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq2_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq2_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq3_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq1_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq1_m,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq4_nl,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq3_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq4_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=128,n=1,k=1056,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=2112,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
@@ -8916,6 +8951,11 @@
"BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200000,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200000,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[643251,3,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
"BLAS","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
"BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
@@ -8968,6 +9008,7 @@
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -8977,6 +9018,7 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -8987,11 +9029,13 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9001,6 +9045,7 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9011,11 +9056,13 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9025,6 +9072,7 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9035,11 +9083,13 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9049,6 +9099,7 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9059,6 +9110,7 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9184,6 +9236,7 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9193,6 +9246,7 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9203,11 +9257,13 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9217,6 +9273,7 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9227,11 +9284,13 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9241,6 +9300,7 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9251,11 +9311,13 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9265,6 +9327,7 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9275,6 +9338,7 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9542,333 +9606,333 @@
"BLAS","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","BLAS"
"BLAS","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","BLAS"
"BLAS","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
"BLAS","SUM","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
"BLAS","SUM_ROWS","type=f32,ne=[10,5,4,3],permute=0,slice=0","support","0","no","BLAS"
"BLAS","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","BLAS"
@@ -9891,8 +9955,9 @@
"BLAS","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
"BLAS","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
"BLAS","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","BLAS"
"BLAS","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","BLAS"
"BLAS","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","BLAS"
@@ -9914,6 +9979,7 @@
"BLAS","CUMSUM","type=f32,ne=[2048,5,4,3]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[242004,1,1,1]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","BLAS"
"BLAS","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
"BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","BLAS"
"BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","BLAS"
@@ -9923,17 +9989,41 @@
"BLAS","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","BLAS"
"BLAS","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","BLAS"
"BLAS","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","BLAS"
Can't render this file because it is too large.
+9960 -3523
View File
File diff suppressed because it is too large Load Diff
+37
View File
@@ -58,3 +58,40 @@ temp = 0.8
ctx-size = 1024
; (and other configurations)
```
### Named presets
If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo containing a single `preset.ini` file that references the actual model(s):
```ini
[*]
mmap = 1
[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}
[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}
```
You can then use it via `llama-cli` or `llama-server`, example:
```sh
llama-server -hf user/repo:gpt-oss-120b-hf
```
Please make sure to provide the correct `hf-repo` for each child preset. Otherwise, you may get error: `The specified tag is not a valid quantization scheme.`
+33 -15
View File
@@ -57,11 +57,21 @@ struct callback_data {
}
};
static bool has_pooling(llama_context * ctx) {
switch (llama_pooling_type(ctx)) {
case LLAMA_POOLING_TYPE_NONE:
case LLAMA_POOLING_TYPE_UNSPECIFIED:
return false;
default:
return true;
}
}
struct output_data {
float * data_ptr = nullptr;
int data_size = 0;
std::string type_suffix;
std::vector<float> storage;
std::vector<float> embd_norm;
std::string prompt;
std::vector<llama_token> tokens;
@@ -73,24 +83,32 @@ struct output_data {
prompt = params.prompt;
if (params.embedding) {
const int n_embd = llama_model_n_embd_out(model);
const bool pooling_enabled = llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
const int n_embd_count = pooling_enabled ? 1 : tokens.size();
const int n_embeddings = n_embd * n_embd_count;
const int n_embd = llama_model_n_embd_out(model);
const bool pooling = has_pooling(ctx);
const int n_embd_count = pooling ? 1 : tokens.size();
const int n_floats = n_embd * n_embd_count;
float * embeddings;
if (pooling_enabled) {
embeddings = llama_get_embeddings_seq(ctx, 0);
storage.resize(n_embeddings);
common_embd_normalize(embeddings, storage.data(), n_embeddings, params.embd_normalize);
embeddings = storage.data();
} else {
embeddings = llama_get_embeddings(ctx);
float * embd_raw = pooling ? llama_get_embeddings_seq(ctx, 0) : llama_get_embeddings(ctx);
if (embd_raw == nullptr) {
throw std::runtime_error("failed to get embeddings from the model");
}
data_ptr = embeddings;
data_size = n_embeddings;
LOG_DBG("pooling_enabled: %s\n", pooling ? "true" : "false");
LOG_DBG("n_embd: %d\n", n_embd);
LOG_DBG("n_floats: %d\n", n_floats);
LOG_DBG("n_embd_count: %d\n", n_embd_count);
data_ptr = embd_raw;
data_size = n_floats;
type_suffix = "-embeddings";
if (params.embd_normalize >= 0) {
embd_norm.resize(n_floats);
for (int i = 0; i < n_embd_count; i++) {
common_embd_normalize(embd_raw+i*n_embd, embd_norm.data()+i*n_embd, n_embd, params.embd_normalize);
}
data_ptr = embd_norm.data();
}
} else {
const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1);
const int n_logits = llama_vocab_n_tokens(vocab);
+17 -3
View File
@@ -32,14 +32,12 @@ if (BLAS_FOUND)
pkg_check_modules(DepBLAS openblas)
endif()
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
add_compile_definitions(GGML_BLAS_USE_BLIS)
pkg_check_modules(DepBLAS blis)
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS blas-atlas)
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS flexiblas_api)
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
add_compile_definitions(GGML_BLAS_USE_MKL)
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS mkl-sdl)
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
@@ -74,10 +72,26 @@ if (BLAS_FOUND)
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
if ("${GGML_BLAS_VENDOR}" STREQUAL "")
message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
add_compile_definitions(GGML_BLAS_USE_BLIS)
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
add_compile_definitions(GGML_BLAS_USE_NVPL)
endif()
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()
+5 -9
View File
@@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
#endif
}
#if defined(OPENBLAS_VERSION)
#if defined(GGML_BLAS_USE_OPENBLAS)
openblas_set_num_threads(ctx->n_threads);
#endif
#if defined(GGML_BLAS_USE_BLIS)
#elif defined(GGML_BLAS_USE_BLIS)
bli_thread_set_num_threads(ctx->n_threads);
#endif
#if defined(GGML_BLAS_USE_NVPL)
#elif defined(GGML_BLAS_USE_NVPL)
nvpl_blas_set_num_threads(ctx->n_threads);
#endif
@@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
/* .context = */ ctx,
};
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
@@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
return "BLIS";
#elif defined(GGML_BLAS_USE_NVPL)
return "NVPL";
#elif defined(OPENBLAS_VERSION)
#elif defined(GGML_BLAS_USE_OPENBLAS)
return "OpenBLAS";
#else
return "BLAS";
+11 -8
View File
@@ -190,7 +190,7 @@ void ggml_cuda_mul_mat_q(
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
if (use_native_mxfp4) {
quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
}
if (amd_wmma_available(cc)) {
// RDNA 4 is consistently worse on rocblas
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
// High expert counts almost always better on MMQ
// due to a large amount of graph splits
// High expert counts are almost always better on MMQ due to
// the synchronization overhead in the cuBLAS/hipBLAS path:
// https://github.com/ggml-org/llama.cpp/pull/18202
if (n_experts >= 64) {
return true;
}
// For some quantization types MMQ can have lower peak TOPS than hipBLAS
// so it's only faster for sufficiently small batch sizes:
switch (type) {
// These quants are really bad on MMQ
case GGML_TYPE_Q2_K:
return ne11 <= 128;
case GGML_TYPE_Q6_K:
// These quants are usually worse but not always
return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
return ne11 <= 128;
return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
default:
return true;
}
}
// For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
return true;
}
+1
View File
@@ -122,6 +122,7 @@ set(GGML_OPENCL_KERNELS
upscale
tanh
expm1
softplus
pad
repeat
mul_mat_f16_f32
+138
View File
@@ -540,6 +540,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_tanh_f16_nd;
cl_kernel kernel_expm1_f32_nd;
cl_kernel kernel_expm1_f16_nd;
cl_kernel kernel_softplus_f32_nd;
cl_kernel kernel_softplus_f16_nd;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
cl_kernel kernel_concat_f32_contiguous;
@@ -1826,6 +1828,31 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK(clReleaseProgram(prog));
}
// softplus
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "softplus.cl.h"
};
#else
const std::string kernel_src = read_file("softplus.cl");
#endif
cl_program prog;
if (!kernel_src.empty()) {
prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
GGML_LOG_CONT(".");
} else {
GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
prog = nullptr;
backend_ctx->kernel_softplus_f32_nd = nullptr;
backend_ctx->kernel_softplus_f16_nd = nullptr;
}
CL_CHECK(clReleaseProgram(prog));
}
// upscale
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3138,6 +3165,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_UNARY_OP_EXPM1:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
case GGML_UNARY_OP_SOFTPLUS:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
default:
return false;
}
@@ -6596,6 +6626,108 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
UNUSED(src1);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
cl_kernel kernel;
if (dst->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_softplus_f32_nd;
} else if (dst->type == GGML_TYPE_F16) {
kernel = backend_ctx->kernel_softplus_f16_nd;
} else {
GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
}
GGML_ASSERT(kernel != nullptr);
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = dst->ne[0];
const int ne11 = dst->ne[1];
const int ne12 = dst->ne[2];
const int ne13 = dst->ne[3];
const cl_ulong nb10 = dst->nb[0];
const cl_ulong nb11 = dst->nb[1];
const cl_ulong nb12 = dst->nb[2];
const cl_ulong nb13 = dst->nb[3];
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
size_t global_work_size[3];
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
return;
}
global_work_size[0] = (size_t)ne10;
global_work_size[1] = (size_t)ne11;
global_work_size[2] = (size_t)ne12;
size_t lws0 = 16, lws1 = 4, lws2 = 1;
if (ne10 < 16) lws0 = ne10;
if (ne11 < 4) lws1 = ne11;
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
size_t local_work_size[] = {lws0, lws1, lws2};
size_t* local_work_size_ptr = local_work_size;
if (!backend_ctx->non_uniform_workgroups) {
if (global_work_size[0] % local_work_size[0] != 0 ||
global_work_size[1] % local_work_size[1] != 0 ||
global_work_size[2] % local_work_size[2] != 0) {
local_work_size_ptr = NULL;
}
}
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -9775,6 +9907,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_expm1;
break;
case GGML_UNARY_OP_SOFTPLUS:
if (!any_on_device) {
return false;
}
func = ggml_cl_softplus;
break;
default:
return false;
} break;
+88
View File
@@ -0,0 +1,88 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// softplus
//------------------------------------------------------------------------------
inline float softplus_f32(float x){
float ax = fabs(x);
float m = fmax(x, 0.0f);
return log1p(exp(-ax)) + m;
}
kernel void kernel_softplus_f32_nd(
global void * p_src0_base,
ulong off_src0_abs,
global void * p_dst_base,
ulong off_dst_abs,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
*dst_val_ptr = softplus_f32(*src_val_ptr);
}
}
}
kernel void kernel_softplus_f16_nd(
global void * p_src0_base,
ulong off_src0_abs,
global void * p_dst_base,
ulong off_dst_abs,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
*dst_val_ptr = (half)(softplus_f32((float)(*src_val_ptr)));
}
}
}
+6 -1
View File
@@ -3002,6 +3002,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
} else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary) {
// This is intentionally using tx_m values, slight performance increase
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
@@ -5078,7 +5083,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
switch (device->vendor_id) {
#ifndef GGML_VULKAN_RUN_TESTS
case VK_VENDOR_ID_AMD:
device->mul_mat_l[i] = false;
device->mul_mat_l[i] = device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary;
device->mul_mat_m[i] = true;
device->mul_mat_s[i] = true;
device->mul_mat_id_l[i] = false;
+1
View File
@@ -1738,6 +1738,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.ATTN_GATE,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
+17 -4
View File
@@ -1,9 +1,22 @@
Copyright (c) 1996 - 2025, Daniel Stenberg, daniel@haxx.se, and many contributors, see the THANKS file.
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1996 - 2026, Daniel Stenberg, <daniel@haxx.se>, and many
contributors, see the THANKS file.
All rights reserved.
Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
Permission to use, copy, modify, and distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright
notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization of the copyright holder.
Except as contained in this notice, the name of a copyright holder shall not
be used in advertising or otherwise to promote the sale, use or other dealings
in this Software without prior written authorization of the copyright holder.
+1 -1
View File
@@ -40,7 +40,7 @@ org_repo=${org_repo%.git}
echo "org/repo: $org_repo"
meta=$(curl -sSf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
meta=$(curl -sSLf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url')
head_ref=$(echo "$meta" | jq -r '.head.ref')
+1
View File
@@ -17,6 +17,7 @@ vendor = {
"https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.0/httplib.h": "vendor/cpp-httplib/httplib.h",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.0/LICENSE": "vendor/cpp-httplib/LICENSE",
"https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
}
+2
View File
@@ -950,6 +950,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
+4 -1
View File
@@ -6763,7 +6763,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} else {
// Linear attention (gated delta net) specific tensors
// Create tensors with calculated dimensions
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
// note: ssm_in is used by legacy GGUF
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+9 -2
View File
@@ -466,7 +466,8 @@ private:
ggml_tensor * cur,
int il);
ggml_tensor * build_delta_net_chunking(
// returns pair of output and new state
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
@@ -478,7 +479,8 @@ private:
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_delta_net_autoregressive(
// returns pair of output and new state
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
@@ -493,6 +495,11 @@ private:
ggml_tensor * gate,
int layer);
// returns pair of qkv, z
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
ggml_tensor * input,
int il);
const llama_model & model;
};
+199 -183
View File
@@ -86,7 +86,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_build_forward_expand(gf, cur);
}
ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
// output dim: [x, y, 1, b]
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
@@ -187,18 +195,16 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
cb(g_cumsum, "g_cumsum", il);
ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
cb(decay_mask, "decay_mask", il);
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
decay_mask = ggml_exp(ctx0, decay_mask);
@@ -208,8 +214,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il);
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
@@ -217,8 +222,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il);
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
@@ -226,116 +230,126 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il);
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(k_cumdecay, "k_cumdecay", il);
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// vectorized calculation of key_gdiff
// improved from the chunked version:
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
// get last element in g_cumsum along chunk_size dimension (ne0)
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
g_last = ggml_cont(ctx0, g_last);
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp);
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// state to be updated per chunk
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * core_attn_out = nullptr;
ggml_tensor * new_state = ggml_dup(ctx0, state);
cb(new_state, "new_state", il);
for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
auto chunkify = [=](ggml_tensor * t) {
return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
};
// shape: (S_k, chunk_size, 1, H_k * n_seqs)
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
auto chunkify_g = [=](ggml_tensor * t) {
return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, t->ne[1], 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
};
// shape: (S_v, chunk_size, 1, H_v * n_seqs)
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
ggml_tensor * k_chunk = chunkify(k);
ggml_tensor * q_chunk = chunkify(q);
ggml_tensor * v_chunk = chunkify(v);
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
ggml_tensor * g_cs_chunk = chunkify_g(g_cumsum);
ggml_tensor * g_cs_chunk_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cs_chunk));
ggml_tensor * decay_mask_chunk = chunkify(decay_mask);
ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
ggml_tensor * gexp_chunk = ggml_exp(ctx0, g_cs_chunk_t);
// shape: (chunk_size, 1, H_v * n_seqs)
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
attn = ggml_mul(ctx0, attn, decay_mask_chunk);
attn = ggml_mul(ctx0, attn, diag_mask);
// replaced by precomputed attn_kq
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
cb(attn_chunk, "attn_chunk", il);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
// v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new_chunk", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter_chunk", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
cb(v_attn, "v_attn_chunk", il);
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
core_attn_out = core_attn_out == nullptr
? core_attn_out_chunk
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk));
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff)));
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * g_cum_last =
ggml_cont(ctx0, ggml_view_4d(ctx0, g_cs_chunk_t, g_cs_chunk_t->ne[0], 1, g_cs_chunk_t->ne[2], g_cs_chunk_t->ne[3],
g_cs_chunk_t->nb[1], g_cs_chunk_t->nb[2], g_cs_chunk_t->nb[3],
g_cs_chunk_t->nb[0] * (g_cs_chunk_t->ne[1] - 1)));
ggml_tensor * gexp_last =
ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
ggml_tensor * g_cum_last_3d =
ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cs_chunk, g_cs_chunk->ne[0], g_cs_chunk->ne[2], g_cs_chunk->ne[3]);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k_chunk,
ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
g_diff_exp->ne[2] * g_diff_exp->ne[3]));
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
new_state = ggml_add(ctx0,
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last, gexp_last->ne[0], gexp_last->ne[1], H_v, n_seqs)),
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
}
core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0);
// truncate padded tokens
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
S_v, n_tokens, H_v, n_seqs,
ggml_row_size(core_attn_out->type, S_v),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
output_tokens = ggml_cont(ctx0, output_tokens);
cb(output_tokens, "output_tokens", il);
// flatten output
ggml_tensor * flat_output =
ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
// permute back to (S_v, H_v, n_tokens, n_seqs)
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
output_tokens = ggml_cont(ctx0, output_tokens);
ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
return {output_tokens, new_state};
}
ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
@@ -419,11 +433,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il);
// flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
return {core_attn_out, state};
}
ggml_tensor * llm_build_qwen3next::build_norm_gated(
@@ -523,6 +533,88 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
return cur;
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
ggml_tensor * input,
int il) {
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t n_seqs = ubatch.n_seqs;
const int64_t head_k_dim = hparams.ssm_d_state;
const int64_t num_k_heads = hparams.ssm_n_group;
const int64_t num_v_heads = hparams.ssm_dt_rank;
const int64_t head_v_dim = d_inner / num_v_heads;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
if (model.layers[il].wqkv) {
// optimized path
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
cb(z, "z", il);
return { qkv_mixed, z };
} else {
// legacy (slower) path
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
cb(query, "q", il);
ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped));
cb(key, "k", il);
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped));
cb(value, "v", il);
ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped));
z = ggml_cont(ctx0, z);
cb(z, "z", il);
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(query_flat, "query_flat", il);
// key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(key_flat, "key_flat", il);
// value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(value_flat, "value_flat", il);
// Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
cb(qkv_mixed, "qkv_mixed", il);
return { qkv_mixed, z };
}
}
ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
@@ -547,15 +639,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
// Input projections
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
auto qkvz = build_qkvz(cur, il);
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;
ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
cb(mixed_ba, "linear_attn_mixed_ba", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
@@ -575,8 +665,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il);
// Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
@@ -585,48 +676,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
cb(query, "q", il);
ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
split_sizes_qkvz[0] * sizeof(float));
cb(key, "k", il);
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
cb(value, "v", il);
ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
cb(z, "z", il);
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(query_flat, "query_flat", il);
// key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(key_flat, "key_flat", il);
// value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(value_flat, "value_flat", il);
// Get convolution states from cache
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
@@ -637,17 +686,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il);
// Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
cb(qkv_mixed, "qkv_mixed", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
// Calculate the total conv dimension
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
// Calculate convolution kernel size
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0];
@@ -655,6 +693,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il);
@@ -677,26 +718,25 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il);
conv_output_proper = ggml_cont(ctx0, ggml_transpose(ctx0, conv_output_proper));
cb(conv_output_proper, "conv_output_pre_silu", il);
ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
cb(conv_output_silu, "conv_output_silu", il);
ggml_tensor * conv_qkv_mix =
ggml_cont_2d(ctx0, ggml_transpose(ctx0, conv_output_silu), qkv_dim, n_seq_tokens * n_seqs);
cb(conv_qkv_mix, "conv_qkv_mix", il);
ggml_tensor * conv_qkv_mix = conv_output_silu;
// Calculate the total conv dimension
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output
ggml_tensor * q_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1], 0);
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
cb(q_conv, "q_conv", il);
ggml_tensor * k_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il);
ggml_tensor * v_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il);
@@ -705,8 +745,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
cb(state, "state_predelta", il);
@@ -738,45 +776,29 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(v_conv, "v_conv_predelta", il);
// Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
ggml_tensor * attn_out;
std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
}
cb(attn_out, "attn_out", il);
// The tensors were concatenated 1d, so we need to extract them 1d as well
const int64_t output_flat_size = head_v_dim * num_v_heads * n_seq_tokens * n_seqs;
ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0);
cb(attn_out_1d, "attn_out_1d", il);
ggml_tensor * attn_out_final = ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
cb(attn_out_final, "attn_out_reshaped", il);
// Extract the state part (second part of the concatenated tensor)
// State starts after n_tokens elements along dimension 1
const int64_t state_flat_size = head_v_dim * head_v_dim * num_v_heads * n_seqs;
ggml_tensor * state_1d =
ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out));
cb(state_1d, "state_1d", il);
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(new_state, "new_state", il);
// Update the recurrent states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, state_1d,
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out));
// Reshape both attn_out_final and z to 2D tensors for normalization
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * attn_out_2d_final =
ggml_cont_2d(ctx0, attn_out_final, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * z_2d = ggml_cont_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z)
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
@@ -828,12 +850,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
shared_gate = ggml_sigmoid(ctx0, shared_gate);
cb(shared_gate, "shared_expert_gate_sigmoid", il);
// The gate needs to be broadcast to match the dimensions of ffn_shexp
// ffn_shexp is [n_embd, n_tokens, 1, 1] and shared_gate is [1, n_tokens, 1, 1]
// We need to repeat the gate along the feature dimension
shared_gate = ggml_repeat(ctx0, shared_gate, ffn_shexp);
cb(shared_gate, "shared_expert_gate_broadcast", il);
// Apply the gate to the shared expert output
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
cb(ffn_shexp, "ffn_shexp_gated", il);
-9
View File
@@ -223,15 +223,6 @@ llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")
llama_build_and_test(test-backend-sampler.cpp LABEL "model")
llama_test(test-backend-sampler NAME test-backend-sampler-greedy ARGS --test greedy)
llama_test(test-backend-sampler NAME test-backend-sampler-temp ARGS --test temp)
llama_test(test-backend-sampler NAME test-backend-sampler-top_k ARGS --test top_k)
llama_test(test-backend-sampler NAME test-backend-sampler-dist ARGS --test dist)
llama_test(test-backend-sampler NAME test-backend-sampler-dist-and-cpu ARGS --test dist_and_cpu)
llama_test(test-backend-sampler NAME test-backend-sampler-logit-bias ARGS --test logit_bias)
llama_test(test-backend-sampler NAME test-backend-sampler-mul_seq ARGS --test multi_sequence)
llama_test(test-backend-sampler NAME test-backend-sampler-set-sampler ARGS --test set_sampler)
# Test for state restore with fragmented KV cache
# Requires a model, uses same args pattern as test-thread-safety
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+49 -2
View File
@@ -454,6 +454,28 @@ static bool ggml_is_view_op(enum ggml_op op) {
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
}
static bool backend_has_feature(ggml_backend_t backend, const char * feature_name) {
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto get_features = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
if (!get_features) {
return false;
}
const ggml_backend_feature * features = get_features(reg);
if (!features) {
return false;
}
for (const ggml_backend_feature * f = features; f->name; ++f) {
if (strcmp(f->name, feature_name) == 0 && strcmp(f->value, "1") == 0) {
return true;
}
}
return false;
}
enum test_mode {
MODE_TEST,
MODE_PERF,
@@ -1101,6 +1123,11 @@ struct test_case {
return 1e-7;
}
virtual double max_nmse_err(ggml_backend_t backend) {
GGML_UNUSED(backend);
return max_nmse_err();
}
virtual double max_maa_err() {
return 1e-4;
}
@@ -1109,6 +1136,10 @@ struct test_case {
return max_nmse_err();
}
virtual double max_err(ggml_backend_t backend) {
return max_nmse_err(backend);
}
virtual double err(const float * a, const float * b, size_t n) {
return nmse(a, b, n);
}
@@ -1378,8 +1409,8 @@ struct test_case {
}
double err = ud->tc->err(f1.data(), f2.data(), f1.size());
if (err > ud->tc->max_err()) {
printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err());
if (err > ud->tc->max_err(ud->backend1)) {
printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err(ud->backend1));
//for (int i = 0; i < (int) f1.size(); i++) {
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
//}
@@ -3686,6 +3717,14 @@ struct test_mul_mat : public test_case {
return 5e-4;
}
double max_nmse_err(ggml_backend_t backend) override {
// for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
return 2e-2;
}
return max_nmse_err();
}
int64_t grad_nmax() override {
return 20000;
}
@@ -3814,6 +3853,14 @@ struct test_mul_mat_id : public test_case {
return 5e-4;
}
double max_nmse_err(ggml_backend_t backend) override {
// for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
return 2e-2;
}
return max_nmse_err();
}
uint64_t op_flops(ggml_tensor * t) override {
GGML_UNUSED(t);
return 2 * m * k * n * n_used;
+157 -229
View File
@@ -11,76 +11,78 @@
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
struct backend_cli_args {
const char * model = nullptr;
const char * test = nullptr;
const char * device = "cpu";
struct test_args {
std::string model;
std::string test;
std::string device = "auto";
};
struct test_model_context {
llama_model_ptr model;
struct test_params {
llama_model_ptr model;
};
static llama_model_ptr load_model(const test_args & args) {
auto mparams = llama_model_default_params();
ggml_backend_dev_t devs[2] = { nullptr, nullptr };
if (args.device != "auto") {
if (args.device == "gpu") {
devs[0] = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (devs[0] == nullptr) {
fprintf(stderr, "Error: GPU requested but not available\n");
return nullptr;
}
mparams.n_gpu_layers = 999;
} else if (args.device == "cpu") {
devs[0] = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
mparams.n_gpu_layers = 0;
} else {
fprintf(stderr, "Error: invalid device '%s'\n", args.device.c_str());
return nullptr;
}
mparams.devices = devs;
fprintf(stderr, "Using device: %s\n", ggml_backend_dev_name(devs[0]));
}
llama_model_ptr res;
res.reset(llama_model_load_from_file(args.model.c_str(), mparams));
if (!res) {
fprintf(stderr, "Warning: failed to load model '%s', skipping test\n", args.model.c_str());
return nullptr;
}
return res;
}
struct test_context {
llama_context_ptr ctx;
int n_vocab = 0;
int n_vocab = 0;
const llama_vocab * vocab = nullptr;
std::unordered_map<llama_seq_id, int32_t> seq_positions;
std::unordered_map<llama_seq_id, int32_t> last_batch_info;
bool load_model(const backend_cli_args & args) {
if (model) {
return true;
}
test_context(const test_params & params, std::vector<llama_sampler_seq_config> & configs, int32_t n_seq_max = -1) {
auto * model = params.model.get();
llama_backend_init();
auto mparams = llama_model_default_params();
ggml_backend_dev_t devs[2];
if (std::string_view(args.device) == "gpu") {
ggml_backend_dev_t gpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (gpu == nullptr) {
fprintf(stderr, "Error: GPU requested but not available\n");
return false;
}
devs[0] = gpu;
devs[1] = nullptr; // null terminator
mparams.devices = devs;
mparams.n_gpu_layers = 999;
} else if (std::string_view(args.device) == "cpu") {
ggml_backend_dev_t cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
devs[0] = cpu;
devs[1] = nullptr; // null terminator
mparams.devices = devs;
}
fprintf(stderr, "Using device: %s\n", ggml_backend_dev_name(devs[0]));
model.reset(llama_model_load_from_file(args.model, mparams));
if (!model) {
fprintf(stderr, "Warning: failed to load model '%s', skipping test\n", args.model);
return false;
}
n_vocab = llama_vocab_n_tokens(get_vocab());
fprintf(stderr, "Vocabulary size: %d\n", n_vocab);
return true;
}
bool setup(const backend_cli_args & args, std::vector<llama_sampler_seq_config> & configs, int32_t n_seq_max = -1) {
if (!model) {
load_model(args);
}
if (ctx) {
return true;
}
GGML_ASSERT(model);
GGML_ASSERT(!ctx);
llama_context_params cparams = llama_context_default_params();
cparams.n_ctx = 512;
@@ -99,26 +101,23 @@ struct test_model_context {
cparams.n_seq_max = n_seq_max;
}
ctx.reset(llama_init_from_model(model.get(), cparams));
ctx.reset(llama_init_from_model(model, cparams));
if (!ctx) {
fprintf(stderr, "Warning: failed to create context, skipping test\n");
return false;
throw std::runtime_error("failed to create context");
}
llama_set_warmup(ctx.get(), false);
return true;
vocab = llama_model_get_vocab(model);
n_vocab = llama_vocab_n_tokens(vocab);
}
bool decode(const std::map<llama_seq_id, std::string> & prompts) {
if (!ctx) {
fprintf(stderr, "Error: context not initialized, call setup() first\n");
return false;
}
GGML_ASSERT(ctx);
last_batch_info.clear();
llama_batch batch = llama_batch_init(512, 0, prompts.size());
auto vocab = get_vocab();
for (const auto & [seq_id, prompt] : prompts) {
std::vector<llama_token> tokens;
tokens.push_back(llama_vocab_bos(vocab));
@@ -199,10 +198,7 @@ struct test_model_context {
}
bool decode_token(llama_token token, llama_seq_id seq_id = 0) {
if (ctx == nullptr) {
fprintf(stderr, "Error: context not initialized, call setup() first\n");
return false;
}
GGML_ASSERT(ctx);
llama_batch batch = llama_batch_init(1, 0, 1);
int32_t pos = seq_positions[seq_id];
@@ -218,14 +214,12 @@ struct test_model_context {
seq_positions[seq_id]++;
llama_batch_free(batch);
return true;
}
bool decode_tokens(const std::map<llama_seq_id, llama_token> & seq_tokens) {
if (ctx == nullptr) {
fprintf(stderr, "Error: context not initialized, call setup() first\n");
return false;
}
GGML_ASSERT(ctx);
llama_batch batch = llama_batch_init(seq_tokens.size(), 0, seq_tokens.size());
@@ -247,40 +241,27 @@ struct test_model_context {
update_batch_info(batch);
llama_batch_free(batch);
return true;
}
std::string token_to_piece(llama_token token, bool special) {
std::string token_to_piece(llama_token token, bool special) const {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(get_vocab(), token, &piece[0], piece.size(), 0, special);
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(get_vocab(), token, &piece[0], piece.size(), 0, special);
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
} else {
piece.resize(n_chars);
}
return piece;
}
void reset() {
ctx.reset();
seq_positions.clear();
last_batch_info.clear();
}
const llama_vocab * get_vocab() const {
return model ? llama_model_get_vocab(model.get()) : nullptr;
}
};
static void test_backend_greedy_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_greedy_sampling(const test_params & params) {
const int seq_id = 0;
struct llama_sampler_chain_params backend_sampler_params = llama_sampler_chain_default_params();
@@ -289,9 +270,7 @@ static void test_backend_greedy_sampling(const backend_cli_args & args) {
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_greedy());
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Some"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -317,9 +296,7 @@ static void test_backend_greedy_sampling(const backend_cli_args & args) {
}
}
static void test_backend_top_k_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_top_k_sampling(const test_params & params) {
const int seq_id = 0;
const int32_t k = 8;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
@@ -327,9 +304,7 @@ static void test_backend_top_k_sampling(const backend_cli_args & args) {
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_k(k));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Hello"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -358,16 +333,12 @@ static void test_backend_top_k_sampling(const backend_cli_args & args) {
llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
const std::string token_str = test_ctx.token_to_piece(token, false);
GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
printf("backend top-k hybrid sampling test PASSED\n");
}
static void test_backend_temp_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_temp_sampling(const test_params & params) {
{
const float temp_0 = 0.8f;
struct llama_sampler_chain_params backend_chain_params_0 = llama_sampler_chain_default_params();
@@ -384,9 +355,7 @@ static void test_backend_temp_sampling(const backend_cli_args & args) {
{ 1, backend_sampler_chain_1.get() }
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{0, "Some where over the"}, {1, "Once upon a"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -430,8 +399,6 @@ static void test_backend_temp_sampling(const backend_cli_args & args) {
auto test_argmax_temp = [&](float temp) {
printf("\nTesting temperature = %.1f\n", temp);
test_ctx.reset();
int seq_id = 0;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
@@ -441,9 +408,7 @@ static void test_backend_temp_sampling(const backend_cli_args & args) {
{ seq_id, backend_sampler_chain.get() },
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Once"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -459,12 +424,9 @@ static void test_backend_temp_sampling(const backend_cli_args & args) {
test_argmax_temp(-1.0f);
printf("backend temp sampling test PASSED\n");
}
static void test_backend_temp_ext_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_temp_ext_sampling(const test_params & params) {
{
int seq_id = 0;
const float temp = 0.8f;
@@ -478,9 +440,7 @@ static void test_backend_temp_ext_sampling(const backend_cli_args & args) {
{ seq_id, backend_sampler_chain.get() },
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Once upon a"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -494,14 +454,10 @@ static void test_backend_temp_ext_sampling(const backend_cli_args & args) {
}
}
test_ctx.reset();
// lambda to testing non-positive temp/delta/exponent values.
auto test_argmax_temp = [&](float temp, float delta, float exponent) {
printf("\nTesting temperature = %.1f, delta = %1.f, exponent = %1.f\n", temp, delta, exponent);
test_ctx.reset();
int seq_id = 0;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
@@ -511,9 +467,7 @@ static void test_backend_temp_ext_sampling(const backend_cli_args & args) {
{ seq_id, backend_sampler_chain.get() },
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Once"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -535,12 +489,9 @@ static void test_backend_temp_ext_sampling(const backend_cli_args & args) {
test_argmax_temp(0.8f, 0.0f, 2.0f); // Temperature scaling
printf("backend temp_ext sampling test PASSED\n");
}
static void test_backend_min_p_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_min_p_sampling(const test_params & params) {
const int seq_id = 0;
const float p = 0.1;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
@@ -548,9 +499,7 @@ static void test_backend_min_p_sampling(const backend_cli_args & args) {
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_min_p(p, 0));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Hello"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -594,9 +543,7 @@ static void test_backend_min_p_sampling(const backend_cli_args & args) {
printf("min-p sampling test PASSED\n");
}
static void test_backend_top_p_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_top_p_sampling(const test_params & params) {
const int seq_id = 0;
const float p = 0.9;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
@@ -604,9 +551,7 @@ static void test_backend_top_p_sampling(const backend_cli_args & args) {
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_p(p, 0));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Hello"}})) {
return;
@@ -648,9 +593,7 @@ static void test_backend_top_p_sampling(const backend_cli_args & args) {
printf("top-p sampling test PASSED\n");
}
static void test_backend_multi_sequence_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_multi_sequence_sampling(const test_params & params) {
struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_greedy());
@@ -665,9 +608,7 @@ static void test_backend_multi_sequence_sampling(const backend_cli_args & args)
{ 1, sampler_chain_1.get() }
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
std::map<llama_seq_id, std::string> prompts = {
{0, "Hello"},
@@ -718,19 +659,16 @@ static void test_backend_multi_sequence_sampling(const backend_cli_args & args)
printf("backend multi-sequence sampling test PASSED\n");
}
static void test_backend_dist_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_dist_sampling(const test_params & params) {
const int seq_id = 189;
const int32_t seed = 88;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Some"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -749,19 +687,16 @@ static void test_backend_dist_sampling(const backend_cli_args & args) {
printf("backend dist sampling test PASSED\n");
}
static void test_backend_dist_sampling_and_cpu(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_dist_sampling_and_cpu(const test_params & params) {
const int seq_id = 0;
const int32_t seed = 88;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Some"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -782,31 +717,31 @@ static void test_backend_dist_sampling_and_cpu(const backend_cli_args & args) {
printf("backend dist & cpu sampling test PASSED\n");
}
static void test_backend_logit_bias_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
// Calling load_model to ensure vocab is loaded and can be accessed
if (!test_ctx.load_model(args)) {
return;
}
static void test_backend_logit_bias_sampling(const test_params & params) {
const auto * model = params.model.get();
const auto * vocab = llama_model_get_vocab(model);
const int seq_id = 0;
// Create the logit biases vector.
std::vector<llama_logit_bias> logit_bias;
// Get the token for the piece "World".
const std::string piece = "World";
std::vector<llama_token> tokens(16);
llama_tokenize(test_ctx.get_vocab(), piece.c_str(), piece.size(), tokens.data(), tokens.size(), false, false);
llama_tokenize(vocab, piece.c_str(), piece.size(), tokens.data(), tokens.size(), false, false);
llama_token bias_token = tokens[0];
logit_bias.push_back({ bias_token, +100.0f });
// TODO: biasing too much here makes the Vulkan sampling fail - should be investigated further
// https://github.com/ggml-org/llama.cpp/actions/runs/20894267644/job/60030252675?pr=18753#step:3:23350
//logit_bias.push_back({ bias_token, +100.0f });
logit_bias.push_back({ bias_token, +10.0f });
printf("biasing token piece '%s' -> token id %d\n", piece.c_str(), bias_token);
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_logit_bias(
llama_vocab_n_tokens(test_ctx.get_vocab()),
llama_vocab_n_tokens(vocab),
logit_bias.size(),
logit_bias.data()));
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(88));
@@ -815,17 +750,14 @@ static void test_backend_logit_bias_sampling(const backend_cli_args & args) {
{ seq_id, backend_sampler_chain.get() },
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Hello"}})) {
GGML_ASSERT(false && "Failed to decode token");
}
llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
const std::string backend_token_str = test_ctx.token_to_piece(backend_token, false);
printf("logit bias sampled token = %d, string='%s'\n", backend_token, backend_token_str.c_str());
printf("sampled token = %d, expected = %d\n", backend_token, bias_token);
GGML_ASSERT(backend_token == bias_token);
printf("backend logit bias sampling test PASSED\n");
@@ -833,9 +765,7 @@ static void test_backend_logit_bias_sampling(const backend_cli_args & args) {
// This test verifies that it is possible to have two different backend sampler,
// one that uses the backend dist sampler, and another that uses CPU dist sampler.
static void test_backend_mixed_sampling(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_mixed_sampling(const test_params & params) {
struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));
@@ -850,9 +780,7 @@ static void test_backend_mixed_sampling(const backend_cli_args & args) {
{ 1, sampler_chain_1.get() }
};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
std::map<llama_seq_id, std::string> prompts = {
{0, "Hello"},
@@ -887,19 +815,16 @@ static void test_backend_mixed_sampling(const backend_cli_args & args) {
printf("backend mixed sampling test PASSED\n");
}
static void test_backend_set_sampler(const backend_cli_args & args) {
test_model_context test_ctx;
const int32_t seed = 88;
static void test_backend_set_sampler(const test_params & params) {
const int seq_id = 0;
const int32_t seed = 88;
struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
if (!test_ctx.decode({{seq_id, "Hello"}})) {
GGML_ASSERT(false && "Failed to decode token");
@@ -955,9 +880,7 @@ static void test_backend_set_sampler(const backend_cli_args & args) {
printf("backend set sampler test PASSED\n");
}
static void test_backend_cpu_mixed_batch(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_cpu_mixed_batch(const test_params & params) {
// Sequence 0 uses backend sampling
struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
@@ -968,12 +891,10 @@ static void test_backend_cpu_mixed_batch(const backend_cli_args & args) {
};
// We need 2 sequences: seq 0 with backend sampling, seq 1 with CPU sampling
if (!test_ctx.setup(args, backend_sampler_configs, 2)) {
return;
}
test_context test_ctx(params, backend_sampler_configs, 2);
std::map<llama_seq_id, std::string> prompts = {
{0, "Hello"}, // Will use backend sampling
{0, "Hello"}, // Will use backend sampling
{1, "Some"} // Will use CPU sampling
};
@@ -1047,28 +968,25 @@ static void test_backend_cpu_mixed_batch(const backend_cli_args & args) {
printf("backend-cpu mixed batch test PASSED\n");
}
static void test_backend_max_outputs(const backend_cli_args & args) {
test_model_context test_ctx;
static void test_backend_max_outputs(const test_params & params) {
const int seq_id = 0;
const int32_t seed = 88;
llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
if (!test_ctx.setup(args, backend_sampler_configs)) {
return;
}
test_context test_ctx(params, backend_sampler_configs);
llama_batch batch = llama_batch_init(512, 0, 1);
std::string prompt = "Hello";
std::vector<llama_token> tokens;
tokens.push_back(llama_vocab_bos(test_ctx.get_vocab()));
tokens.push_back(llama_vocab_bos(test_ctx.vocab));
std::vector<llama_token> prompt_tokens(32);
int n_tokens = llama_tokenize(test_ctx.get_vocab(), prompt.c_str(), prompt.length(),
int n_tokens = llama_tokenize(test_ctx.vocab, prompt.c_str(), prompt.length(),
prompt_tokens.data(), prompt_tokens.size(),
false, false);
for (int i = 0; i < n_tokens; i++) {
@@ -1090,8 +1008,8 @@ static void test_backend_max_outputs(const backend_cli_args & args) {
}
struct backend_test_case {
const char * name;
void (*fn)(const backend_cli_args &);
std::string name;
void (*fn)(const test_params &);
bool enabled_by_default;
};
@@ -1112,8 +1030,8 @@ static const backend_test_case BACKEND_TESTS[] = {
{ "top_p", test_backend_top_p_sampling, true },
};
static backend_cli_args parse_backend_cli(int argc, char ** argv) {
backend_cli_args out;
static test_args parse_cli(int argc, char ** argv) {
test_args out;
for (int i = 1; i < argc; ++i) {
const char * arg = argv[i];
@@ -1154,7 +1072,7 @@ static backend_cli_args parse_backend_cli(int argc, char ** argv) {
out.device = arg + 9;
continue;
}
if (!out.model) {
if (out.model.empty()) {
out.model = arg;
continue;
}
@@ -1163,28 +1081,28 @@ static backend_cli_args parse_backend_cli(int argc, char ** argv) {
exit(EXIT_FAILURE);
}
if (std::strcmp(out.device, "cpu") != 0 && std::strcmp(out.device, "gpu") != 0) {
fprintf(stderr, "Invalid device '%s'. Must be 'cpu' or 'gpu'\n", out.device);
if (out.device != "cpu" && out.device != "gpu" && out.device != "auto") {
fprintf(stderr, "Invalid device '%s'. Must be 'cpu', 'gpu' or 'auto'\n", out.device.c_str());
exit(EXIT_FAILURE);
}
return out;
}
static std::vector<const backend_test_case *> collect_tests_to_run(const char * requested) {
static std::vector<const backend_test_case *> collect_tests_to_run(const std::string & requested) {
std::vector<const backend_test_case *> selected;
if (requested != nullptr) {
if (!requested.empty()) {
for (const auto & test : BACKEND_TESTS) {
if (std::strcmp(test.name, requested) == 0) {
if (test.name == requested) {
selected.push_back(&test);
break;
}
}
if (selected.empty()) {
fprintf(stderr, "Unknown test '%s'. Available tests:\n", requested);
fprintf(stderr, "Unknown test '%s'. Available tests:\n", requested.c_str());
for (const auto & test : BACKEND_TESTS) {
fprintf(stderr, " %s\n", test.name);
fprintf(stderr, " %s\n", test.name.c_str());
}
exit(EXIT_FAILURE);
}
@@ -1203,34 +1121,44 @@ static std::vector<const backend_test_case *> collect_tests_to_run(const char *
return selected;
}
static void run_tests(const std::vector<const backend_test_case *> & tests, const backend_cli_args & args) {
for (const auto * test : tests) {
fprintf(stderr, "\n=== %s ===\n", test->name);
test->fn(args);
static void run_tests(const std::vector<const backend_test_case *> & tests, const test_params & args) {
for (const auto & test : tests) {
fprintf(stderr, "\n=== %s ===\n", test->name.c_str());
try {
test->fn(args);
} catch (const std::exception & e) {
fprintf(stderr, "Error running test '%s': %s\n", test->name.c_str(), e.what());
exit(EXIT_FAILURE);
}
}
}
int main(int argc, char ** argv) {
backend_cli_args args = parse_backend_cli(argc, argv);
test_args args = parse_cli(argc, argv);
if (args.model == nullptr) {
if (args.model.empty()) {
args.model = get_model_or_exit(1, argv);
}
std::ifstream file(args.model);
if (!file.is_open()) {
fprintf(stderr, "no model '%s' found\n", args.model);
return EXIT_FAILURE;
{
std::ifstream file(args.model);
if (!file.is_open()) {
fprintf(stderr, "no model '%s' found\n", args.model.c_str());
return EXIT_FAILURE;
}
}
fprintf(stderr, "using '%s'\n", args.model);
fprintf(stderr, "using '%s'\n", args.model.c_str());
ggml_time_init();
llama_backend_init();
test_params params = {
/*.model =*/ load_model(args),
};
const std::vector<const backend_test_case *> tests = collect_tests_to_run(args.test);
if (!tests.empty()) {
run_tests(tests, args);
run_tests(tests, params);
}
return 0;
+3 -3
View File
@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
if expect_ok:
assert res.status_code == 200
# note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
if res.status_code == 200:
assert "content" in res.body
if "timings" in res.body:
assert res.body["timings"]["predicted_n"] == n_predict
else:
assert res.status_code == 500
assert "content" not in res.body
@pytest.mark.parametrize(
+6 -1
View File
@@ -1,4 +1,5 @@
set(TARGET cpp-httplib)
license_add_file("cpp-httplib" "LICENSE")
find_package(Threads REQUIRED)
@@ -8,7 +9,7 @@ if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -w)
endif()
target_link_libraries (${TARGET} PRIVATE Threads::Threads)
target_link_libraries(${TARGET} PRIVATE Threads::Threads)
if (WIN32 AND NOT MSVC)
target_link_libraries(${TARGET} PRIVATE ws2_32)
@@ -67,6 +68,8 @@ if (LLAMA_BUILD_BORINGSSL)
set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS})
set(BUILD_TESTING ${SAVED_BUILD_TESTING})
license_add_file("BoringSSL" "${boringssl_SOURCE_DIR}/LICENSE")
set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE)
target_link_libraries(${TARGET} PUBLIC ssl crypto)
@@ -108,6 +111,8 @@ elseif (LLAMA_BUILD_LIBRESSL)
set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS})
set(BUILD_TESTING ${SAVED_BUILD_TESTING})
license_add_file("LibreSSL" "${libressl_SOURCE_DIR}/COPYING")
set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE)
target_link_libraries(${TARGET} PUBLIC ssl crypto)
@@ -19,3 +19,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.