mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-29 00:57:39 +02:00
Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5fa9e63be8 | |||
| a4c340f974 | |||
| d0a417f3c7 | |||
| 43f2b07193 | |||
| e5d6c2554e | |||
| f0dd6a1926 | |||
| 69699be48a | |||
| 85f36e5e71 | |||
| c0a97b762e | |||
| ced44be342 | |||
| e291450b76 | |||
| 59e991c23c | |||
| ca2bb89eac | |||
| 2d451c8059 | |||
| 4753791e70 | |||
| 77d5e9a76a | |||
| d5fe4e81bd | |||
| 295354ea68 | |||
| 558a764713 | |||
| edb18b6e8f | |||
| 514c45608f | |||
| 553a5c3a9f | |||
| 13be08daf9 | |||
| 226251ed56 | |||
| 87616f0680 | |||
| 63b4911494 | |||
| c6e8cc28c1 | |||
| b10d8bfdb1 | |||
| 13b4548877 | |||
| 572b3141d3 | |||
| 7c727fbe39 | |||
| 80982e815e |
@@ -13,6 +13,7 @@ Checks: >
|
||||
-readability-magic-numbers,
|
||||
-readability-uppercase-literal-suffix,
|
||||
-readability-simplify-boolean-expr,
|
||||
-readability-math-missing-parentheses,
|
||||
clang-analyzer-*,
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
|
||||
performance-*,
|
||||
|
||||
@@ -16,9 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
|
||||
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
||||
|
||||
+132
-52
@@ -38,6 +38,11 @@
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
std::initializer_list<enum llama_example> mmproj_examples = {
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
|
||||
};
|
||||
|
||||
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
this->examples = std::move(examples);
|
||||
return *this;
|
||||
@@ -157,6 +162,10 @@ struct common_hf_file_res {
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
bool common_has_curl() {
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
@@ -522,6 +531,50 @@ static bool common_download_model(
|
||||
return true;
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::vector<char> res_buffer;
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
auto data_vec = static_cast<std::vector<char> *>(data);
|
||||
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (params.timeout > 0) {
|
||||
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
|
||||
}
|
||||
if (params.max_size > 0) {
|
||||
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
||||
}
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
for (const auto & header : params.headers) {
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
std::string error_msg = curl_easy_strerror(res);
|
||||
throw std::runtime_error("error: cannot make GET request: " + error_msg);
|
||||
}
|
||||
|
||||
long res_code;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
|
||||
return { res_code, std::move(res_buffer) };
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
||||
@@ -541,45 +594,26 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
|
||||
std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
// headers
|
||||
std::vector<std::string> headers;
|
||||
headers.push_back("Accept: application/json");
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
headers.push_back("Authorization: Bearer " + bearer_token);
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
// make the request
|
||||
common_remote_params params;
|
||||
params.headers = headers;
|
||||
auto res = common_remote_get_content(url, params);
|
||||
long res_code = res.first;
|
||||
std::string res_str(res.second.data(), res.second.size());
|
||||
std::string ggufFile;
|
||||
std::string mmprojFile;
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
}
|
||||
|
||||
long res_code;
|
||||
std::string ggufFile = "";
|
||||
std::string mmprojFile = "";
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
// extract ggufFile.rfilename in json, using regex
|
||||
{
|
||||
@@ -613,6 +647,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
|
||||
#else
|
||||
|
||||
bool common_has_curl() {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
||||
return false;
|
||||
@@ -635,17 +673,30 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
|
||||
return {};
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
|
||||
if (!url.empty()) {
|
||||
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// utils
|
||||
//
|
||||
|
||||
static void common_params_handle_model(
|
||||
struct handle_model_result {
|
||||
bool found_mmproj = false;
|
||||
common_params_model mmproj;
|
||||
};
|
||||
|
||||
static handle_model_result common_params_handle_model(
|
||||
struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
const std::string & model_path_default,
|
||||
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
|
||||
const std::string & model_path_default) {
|
||||
handle_model_result result;
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
if (!model.hf_repo.empty()) {
|
||||
@@ -657,7 +708,12 @@ static void common_params_handle_model(
|
||||
exit(1); // built without CURL, error message already printed
|
||||
}
|
||||
model.hf_repo = auto_detected.repo;
|
||||
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
|
||||
model.hf_file = auto_detected.ggufFile;
|
||||
if (!auto_detected.mmprojFile.empty()) {
|
||||
result.found_mmproj = true;
|
||||
result.mmproj.hf_repo = model.hf_repo;
|
||||
result.mmproj.hf_file = auto_detected.mmprojFile;
|
||||
}
|
||||
} else {
|
||||
model.hf_file = model.path;
|
||||
}
|
||||
@@ -694,6 +750,8 @@ static void common_params_handle_model(
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const std::vector<ggml_type> kv_cache_types = {
|
||||
@@ -827,16 +885,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
}
|
||||
|
||||
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
|
||||
// allow --mmproj to be set from -hf
|
||||
// assuming that mmproj is always in the same repo as text model
|
||||
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
|
||||
params.mmproj.hf_repo = params.model.hf_repo;
|
||||
// handle model and download
|
||||
{
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (auto & ex : mmproj_examples) {
|
||||
if (ctx_arg.ex == ex) {
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "");
|
||||
break;
|
||||
}
|
||||
}
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
}
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "", true);
|
||||
|
||||
if (params.escape) {
|
||||
string_process_escapes(params.prompt);
|
||||
@@ -968,7 +1035,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
"llama-embedding",
|
||||
"llama-eval-callback",
|
||||
"llama-export-lora",
|
||||
"llama-gbnf-validator",
|
||||
"llama-gen-docs",
|
||||
"llama-gguf",
|
||||
"llama-gguf-hash",
|
||||
@@ -988,7 +1054,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
"llama-perplexity",
|
||||
"llama-q8dot",
|
||||
"llama-quantize",
|
||||
"llama-quantize-stats",
|
||||
"llama-qwen2vl-cli",
|
||||
"llama-retrieval",
|
||||
"llama-run",
|
||||
@@ -2095,18 +2160,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
"path to a multimodal projector file. see examples/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
"URL to a multimodal projector file. see examples/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj"},
|
||||
"explicitly disable multimodal projector, useful when using -hf",
|
||||
[](common_params & params) {
|
||||
params.no_mmproj = true;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj-offload"},
|
||||
"do not offload multimodal projector to GPU",
|
||||
[](common_params & params) {
|
||||
params.mmproj_use_gpu = false;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--image"}, "FILE",
|
||||
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
||||
@@ -2381,6 +2460,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
||||
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
|
||||
"example: unsloth/phi-4-GGUF:q4_k_m\n"
|
||||
"(default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
|
||||
@@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
bool common_has_curl();
|
||||
|
||||
struct common_remote_params {
|
||||
std::vector<std::string> headers;
|
||||
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
|
||||
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
|
||||
};
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
|
||||
@@ -342,6 +342,8 @@ struct common_params {
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
struct common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
|
||||
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
|
||||
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
|
||||
auto has_max = max_items != std::numeric_limits<int>::max();
|
||||
|
||||
if (max_items == 0) {
|
||||
return "";
|
||||
}
|
||||
if (min_items == 0 && max_items == 1) {
|
||||
return item_rule + "?";
|
||||
}
|
||||
|
||||
@@ -2554,11 +2554,12 @@ class Qwen2VLModel(TextModel):
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
for name, data in super().get_tensors():
|
||||
if name.startswith("visual."):
|
||||
continue
|
||||
yield name, data
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("visual."):
|
||||
# skip visual tensors
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("WavTokenizerDec")
|
||||
@@ -5153,7 +5154,7 @@ class Glm4Model(TextModel):
|
||||
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
|
||||
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])
|
||||
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
|
||||
@@ -21,11 +21,6 @@ else()
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
|
||||
if (NOT WIN32)
|
||||
# disabled on Windows because it uses internal functions not exported with LLAMA_API
|
||||
add_subdirectory(gbnf-validator)
|
||||
endif()
|
||||
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
add_subdirectory(gguf)
|
||||
@@ -58,10 +53,6 @@ else()
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(export-lora)
|
||||
if (NOT WIN32)
|
||||
# disabled on Windows because it uses internal functions not exported with LLAMA_API
|
||||
add_subdirectory(quantize-stats)
|
||||
endif()
|
||||
add_subdirectory(llava)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
|
||||
@@ -89,6 +89,13 @@ int main(int argc, char ** argv) {
|
||||
common_init();
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
// utilize the full context
|
||||
if (params.n_batch < params.n_ctx) {
|
||||
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
|
||||
params.n_batch = params.n_ctx;
|
||||
}
|
||||
|
||||
// For non-causal models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
|
||||
@@ -134,7 +141,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// max batch size
|
||||
const uint64_t n_batch = params.n_batch;
|
||||
GGML_ASSERT(params.n_batch >= params.n_ctx);
|
||||
|
||||
// tokenize the prompts and trim
|
||||
std::vector<std::vector<int32_t>> inputs;
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-gbnf-validator)
|
||||
add_executable(${TARGET} gbnf-validator.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -10,6 +10,9 @@ from typing import Any, List, Optional, Set, Tuple, Union
|
||||
|
||||
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
||||
|
||||
if max_items == 0:
|
||||
return ""
|
||||
|
||||
if min_items == 0 and max_items == 1:
|
||||
return f'{item_rule}?'
|
||||
|
||||
|
||||
@@ -36,6 +36,46 @@ static uint64_t get_time_ns() {
|
||||
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
||||
}
|
||||
|
||||
static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
|
||||
if (a.pattern != b.pattern) {
|
||||
// cString comparison that may be null
|
||||
if (a.pattern == nullptr || b.pattern == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (strcmp(a.pattern, b.pattern) != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (a.buft != b.buft) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
|
||||
if (a.size() != b.size()) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < a.size(); i++) {
|
||||
if (!tensor_buft_override_equal(a[i], b[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
|
||||
if (a.size() != b.size()) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < a.size(); i++) {
|
||||
if (!vec_tensor_buft_override_equal(a[i], b[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
|
||||
std::ostringstream str;
|
||||
for (size_t i = 0; i < values.size(); i++) {
|
||||
@@ -175,6 +215,7 @@ struct cmd_params {
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> flash_attn;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
||||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
ggml_numa_strategy numa;
|
||||
@@ -207,6 +248,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* no_kv_offload */ { false },
|
||||
/* flash_attn */ { false },
|
||||
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
|
||||
/* use_mmap */ { true },
|
||||
/* embeddings */ { false },
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
@@ -265,6 +307,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
||||
join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||
@@ -557,6 +600,87 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
params.tensor_split.push_back(tensor_split);
|
||||
}
|
||||
} else if (arg == "-ot" || arg == "--override-tensor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto value = argv[i];
|
||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
if (buft_list.empty()) {
|
||||
// enumerate all the devices and add their buffer types to the list
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
}
|
||||
auto override_group_span_len = std::strcspn(value, ",");
|
||||
bool last_group = false;
|
||||
do {
|
||||
if (override_group_span_len == 0) {
|
||||
// Adds an empty override-tensors for an empty span
|
||||
params.tensor_buft_overrides.push_back({{}});
|
||||
if (value[override_group_span_len] == '\0') {
|
||||
value = &value[override_group_span_len];
|
||||
last_group = true;
|
||||
} else {
|
||||
value = &value[override_group_span_len + 1];
|
||||
override_group_span_len = std::strcspn(value, ",");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// Stamps null terminators into the argv
|
||||
// value for this option to avoid the
|
||||
// memory leak present in the implementation
|
||||
// over in arg.cpp. Acceptable because we
|
||||
// only parse these args once in this program.
|
||||
auto override_group = value;
|
||||
if (value[override_group_span_len] == '\0') {
|
||||
value = &value[override_group_span_len];
|
||||
last_group = true;
|
||||
} else {
|
||||
value[override_group_span_len] = '\0';
|
||||
value = &value[override_group_span_len + 1];
|
||||
}
|
||||
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
|
||||
auto override_span_len = std::strcspn(override_group, ";");
|
||||
while (override_span_len > 0) {
|
||||
auto override = override_group;
|
||||
if (override_group[override_span_len] != '\0') {
|
||||
override_group[override_span_len] = '\0';
|
||||
override_group = &override_group[override_span_len + 1];
|
||||
} else {
|
||||
override_group = &override_group[override_span_len];
|
||||
}
|
||||
auto tensor_name_span_len = std::strcspn(override, "=");
|
||||
if (tensor_name_span_len >= override_span_len) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
override[tensor_name_span_len] = '\0';
|
||||
auto tensor_name = override;
|
||||
auto buffer_type = &override[tensor_name_span_len + 1];
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
printf("Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
|
||||
override_span_len = std::strcspn(override_group, ";");
|
||||
}
|
||||
if (invalid_param) {
|
||||
break;
|
||||
}
|
||||
group_tensor_buft_overrides.push_back({nullptr,nullptr});
|
||||
params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
|
||||
override_group_span_len = std::strcspn(value, ",");
|
||||
} while (!last_group);
|
||||
} else if (arg == "-r" || arg == "--repetitions") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -648,6 +772,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.tensor_split.empty()) {
|
||||
params.tensor_split = cmd_params_defaults.tensor_split;
|
||||
}
|
||||
if (params.tensor_buft_overrides.empty()) {
|
||||
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
|
||||
}
|
||||
if (params.use_mmap.empty()) {
|
||||
params.use_mmap = cmd_params_defaults.use_mmap;
|
||||
}
|
||||
@@ -689,6 +816,7 @@ struct cmd_params_instance {
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
|
||||
@@ -733,13 +861,20 @@ struct cmd_params_instance {
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
mparams.use_mmap = use_mmap;
|
||||
|
||||
if (tensor_buft_overrides.empty()) {
|
||||
mparams.tensor_buft_overrides = nullptr;
|
||||
} else {
|
||||
GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides.data();
|
||||
}
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
bool equal_mparams(const cmd_params_instance & other) const {
|
||||
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
|
||||
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
|
||||
tensor_split == other.tensor_split;
|
||||
tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
|
||||
}
|
||||
|
||||
llama_context_params to_llama_cparams() const {
|
||||
@@ -769,6 +904,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & sm : params.split_mode)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & ot : params.tensor_buft_overrides)
|
||||
for (const auto & mmp : params.use_mmap)
|
||||
for (const auto & embd : params.embeddings)
|
||||
for (const auto & nb : params.n_batch)
|
||||
@@ -804,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
};
|
||||
@@ -833,6 +970,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
};
|
||||
@@ -862,6 +1000,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
};
|
||||
@@ -896,6 +1035,7 @@ struct test {
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
int n_prompt;
|
||||
@@ -927,6 +1067,7 @@ struct test {
|
||||
no_kv_offload = inst.no_kv_offload;
|
||||
flash_attn = inst.flash_attn;
|
||||
tensor_split = inst.tensor_split;
|
||||
tensor_buft_overrides = inst.tensor_buft_overrides;
|
||||
use_mmap = inst.use_mmap;
|
||||
embeddings = inst.embeddings;
|
||||
n_prompt = inst.n_prompt;
|
||||
@@ -972,9 +1113,9 @@ struct test {
|
||||
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
||||
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
|
||||
"embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
|
||||
"stddev_ns", "avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@@ -1000,6 +1141,7 @@ struct test {
|
||||
|
||||
std::vector<std::string> get_values() const {
|
||||
std::string tensor_split_str;
|
||||
std::string tensor_buft_overrides_str;
|
||||
int max_nonzero = 0;
|
||||
for (size_t i = 0; i < llama_max_devices(); i++) {
|
||||
if (tensor_split[i] > 0) {
|
||||
@@ -1014,6 +1156,26 @@ struct test {
|
||||
tensor_split_str += "/";
|
||||
}
|
||||
}
|
||||
if (tensor_buft_overrides.size() == 1) {
|
||||
// Last element of tensor_buft_overrides is always a null pattern
|
||||
// so if it is only one element long, it must be a null pattern.
|
||||
GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
|
||||
tensor_buft_overrides_str += "none";
|
||||
} else {
|
||||
for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
|
||||
// Last element of tensor_buft_overrides is always a null pattern
|
||||
if (tensor_buft_overrides[i].pattern == nullptr) {
|
||||
tensor_buft_overrides_str += "none";
|
||||
} else {
|
||||
tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
|
||||
tensor_buft_overrides_str += "=";
|
||||
tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
|
||||
}
|
||||
if (i + 2 < tensor_buft_overrides.size()) {
|
||||
tensor_buft_overrides_str += ";";
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<std::string> values = { build_commit,
|
||||
std::to_string(build_number),
|
||||
cpu_info,
|
||||
@@ -1037,6 +1199,7 @@ struct test {
|
||||
std::to_string(no_kv_offload),
|
||||
std::to_string(flash_attn),
|
||||
tensor_split_str,
|
||||
tensor_buft_overrides_str,
|
||||
std::to_string(use_mmap),
|
||||
std::to_string(embeddings),
|
||||
std::to_string(n_prompt),
|
||||
@@ -1254,6 +1417,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
if (field == "tensor_buft_overrides") {
|
||||
return "ot";
|
||||
}
|
||||
return field;
|
||||
}
|
||||
|
||||
@@ -1307,6 +1473,9 @@ struct markdown_printer : public printer {
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.emplace_back("tensor_split");
|
||||
}
|
||||
if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
|
||||
fields.emplace_back("tensor_buft_overrides");
|
||||
}
|
||||
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||
fields.emplace_back("use_mmap");
|
||||
}
|
||||
|
||||
+18
-19
@@ -17,22 +17,15 @@
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
||||
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_USE_SILU "clip.use_silu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_EMBD "clip.vision.embedding_length"
|
||||
#define KEY_N_FF "clip.vision.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.vision.block_count"
|
||||
#define KEY_N_HEAD "clip.vision.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.vision.projection_dim"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
@@ -41,9 +34,14 @@
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
|
||||
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
|
||||
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
|
||||
|
||||
//
|
||||
@@ -62,6 +60,7 @@
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
@@ -90,20 +89,19 @@
|
||||
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
#define TN_GLM_BOI_W "adapter.boi"
|
||||
#define TN_GLM_EOI_W "adapter.eoi"
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_LDPV2,
|
||||
PROJECTOR_TYPE_RESAMPLER,
|
||||
PROJECTOR_TYPE_MINICPMV,
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_MERGER,
|
||||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -111,9 +109,10 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
|
||||
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
||||
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
|
||||
+717
-440
File diff suppressed because it is too large
Load Diff
@@ -114,8 +114,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
|
||||
|
||||
@@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
|
||||
" -m and --mmproj are required\n"
|
||||
" -hf user/repo can replace both -m and --mmproj in most cases\n"
|
||||
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
|
||||
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
|
||||
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
|
||||
argv[0]
|
||||
);
|
||||
}
|
||||
@@ -112,10 +113,10 @@ struct mtmd_cli_context {
|
||||
void init_vision_context(common_params & params) {
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
|
||||
/* use_gpu */ true,
|
||||
/* use_gpu */ params.mmproj_use_gpu,
|
||||
/* timings */ true,
|
||||
/* n_threads */ params.cpuparams.n_threads,
|
||||
/* verbosity */ GGML_LOG_LEVEL_INFO,
|
||||
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
|
||||
}));
|
||||
if (!ctx_vision.get()) {
|
||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||
@@ -261,6 +262,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (params.mmproj.path.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
LOG_ERR("ERR: Missing --mmproj argument\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
|
||||
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
import argparse
|
||||
from typing import Dict
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers import (
|
||||
Qwen2VLForConditionalGeneration,
|
||||
Qwen2VLProcessor,
|
||||
AutoProcessor,
|
||||
Qwen2VLConfig
|
||||
Qwen2VLConfig,
|
||||
Qwen2VLProcessor,
|
||||
Qwen2VLForConditionalGeneration,
|
||||
Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue]
|
||||
Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue]
|
||||
)
|
||||
|
||||
|
||||
@@ -19,61 +21,93 @@ def k(raw_key: str, arch: str) -> str:
|
||||
return raw_key.format(arch=arch)
|
||||
|
||||
|
||||
def to_gguf_name(name: str) -> str:
|
||||
og = name
|
||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
||||
name = name.replace("attn.", "attn_")
|
||||
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
|
||||
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
|
||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
||||
name = name.replace("merger.mlp", 'mm')
|
||||
print(f"[to_gguf_name] {og} --> {name}")
|
||||
return name
|
||||
def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]):
|
||||
if fullatt_block_indexes is None:
|
||||
return 0
|
||||
n_wa = fullatt_block_indexes[0]
|
||||
for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]):
|
||||
if b - a - 1 != n_wa:
|
||||
raise ValueError(
|
||||
f"window/full attention layer should have fix pattern of "
|
||||
f"for each full-attention layer followed by {n_wa} window-attention layers"
|
||||
)
|
||||
return n_wa + 1
|
||||
|
||||
|
||||
def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
|
||||
vision_model = qwen2vl.visual
|
||||
tensor_map = {}
|
||||
for name, ten in vision_model.state_dict().items():
|
||||
ten = ten.numpy()
|
||||
if 'qkv' in name:
|
||||
if ten.ndim == 2: # weight
|
||||
c3, _ = ten.shape
|
||||
else: # bias
|
||||
c3 = ten.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = ten[:c]
|
||||
wk = ten[c: c * 2]
|
||||
wv = ten[c * 2:]
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
|
||||
elif 'merger' in name:
|
||||
if name.endswith("ln_q.weight"):
|
||||
tensor_map['v.post_ln.weight'] = ten
|
||||
elif name.endswith("ln_q.bias"):
|
||||
tensor_map['v.post_ln.bias'] = ten
|
||||
class VL2:
|
||||
|
||||
@staticmethod
|
||||
def to_gguf_name(name: str) -> str:
|
||||
og = name
|
||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
||||
name = name.replace("attn.", "attn_")
|
||||
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
|
||||
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
|
||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
||||
name = name.replace("merger.mlp", 'mm')
|
||||
print(f"[to_gguf_name] {og} --> {name}")
|
||||
return name
|
||||
|
||||
@classmethod
|
||||
def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
|
||||
vision_model = qwen2vl.visual
|
||||
tensor_map = {}
|
||||
for name, ten in vision_model.state_dict().items():
|
||||
ten = ten.numpy()
|
||||
if 'qkv' in name:
|
||||
if ten.ndim == 2: # weight
|
||||
c3, _ = ten.shape
|
||||
else: # bias
|
||||
c3 = ten.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = ten[:c]
|
||||
wk = ten[c: c * 2]
|
||||
wv = ten[c * 2:]
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
|
||||
elif 'merger' in name:
|
||||
if name.endswith("ln_q.weight"):
|
||||
tensor_map['v.post_ln.weight'] = ten
|
||||
elif name.endswith("ln_q.bias"):
|
||||
tensor_map['v.post_ln.bias'] = ten
|
||||
else:
|
||||
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
|
||||
tensor_map[cls.to_gguf_name(name)] = ten
|
||||
elif 'patch_embed.proj.weight' in name:
|
||||
# NOTE: split Conv3D into Conv2Ds
|
||||
c1, c2, kt, kh, kw = ten.shape
|
||||
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
|
||||
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
|
||||
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
|
||||
else:
|
||||
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
|
||||
tensor_map[to_gguf_name(name)] = ten
|
||||
elif 'patch_embed.proj.weight' in name:
|
||||
# NOTE: split Conv3D into Conv2Ds
|
||||
c1, c2, kt, kh, kw = ten.shape
|
||||
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
|
||||
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
|
||||
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
|
||||
else:
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
|
||||
|
||||
for new_name, ten in tensor_map.items():
|
||||
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
|
||||
tensor_map[new_name] = ten.astype(np.float32)
|
||||
else:
|
||||
tensor_map[new_name] = ten.astype(dtype)
|
||||
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
|
||||
return tensor_map
|
||||
for new_name, ten in tensor_map.items():
|
||||
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
|
||||
tensor_map[new_name] = ten.astype(np.float32)
|
||||
else:
|
||||
tensor_map[new_name] = ten.astype(dtype)
|
||||
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
|
||||
return tensor_map
|
||||
|
||||
|
||||
class VL25(VL2):
|
||||
|
||||
@staticmethod
|
||||
def to_gguf_name(name: str) -> str:
|
||||
og = name
|
||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
||||
name = name.replace("attn.", "attn_")
|
||||
name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
|
||||
name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
|
||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
||||
name = name.replace("merger.mlp", 'mm')
|
||||
print(f"[vl25][to_gguf_name] {og} --> {name}")
|
||||
return name
|
||||
|
||||
|
||||
def main(args):
|
||||
@@ -82,7 +116,7 @@ def main(args):
|
||||
np_dtype = np.float32
|
||||
ftype = 0
|
||||
elif args.data_type == 'fp16':
|
||||
dtype = torch.float32
|
||||
dtype = torch.float16
|
||||
np_dtype = np.float16
|
||||
ftype = 1
|
||||
else:
|
||||
@@ -92,11 +126,18 @@ def main(args):
|
||||
model_path = ""
|
||||
model_name = args.model_name
|
||||
print("model_name: ", model_name)
|
||||
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=dtype, device_map="cpu"
|
||||
)
|
||||
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
||||
vcfg = cfg.vision_config
|
||||
if args.model_type == "qwen2vl":
|
||||
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=dtype, device_map="cpu"
|
||||
)
|
||||
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
||||
vcfg = cfg.vision_config
|
||||
else:
|
||||
qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=dtype, device_map="cpu"
|
||||
)
|
||||
cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
||||
vcfg = cfg.vision_config
|
||||
|
||||
if os.path.isdir(model_name):
|
||||
local_model = True
|
||||
@@ -113,7 +154,6 @@ def main(args):
|
||||
fout.add_bool("clip.has_text_encoder", False)
|
||||
fout.add_bool("clip.has_vision_encoder", True)
|
||||
fout.add_bool("clip.has_qwen2vl_merger", True)
|
||||
fout.add_string("clip.projector_type", "qwen2vl_merger")
|
||||
|
||||
print(cfg.vision_config)
|
||||
if 'silu' in cfg.vision_config.hidden_act.lower():
|
||||
@@ -125,14 +165,25 @@ def main(args):
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
tensor_map = find_vision_tensors(qwen2vl, np_dtype)
|
||||
if args.model_type == "qwen2.5vl":
|
||||
fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes))
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
|
||||
fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
|
||||
fout.add_string("clip.projector_type", "qwen2.5vl_merger")
|
||||
else:
|
||||
fout.add_string("clip.projector_type", "qwen2vl_merger")
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
|
||||
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
|
||||
|
||||
if args.model_type == "qwen2.5vl":
|
||||
tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
|
||||
else:
|
||||
tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
|
||||
for name, data in tensor_map.items():
|
||||
fout.add_tensor(name, data)
|
||||
|
||||
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
|
||||
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
|
||||
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
|
||||
@@ -160,6 +211,7 @@ def main(args):
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
|
||||
parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
|
||||
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@@ -23,6 +23,9 @@
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
|
||||
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
|
||||
@@ -367,14 +370,14 @@ static void debug_test_mrope_2d() {
|
||||
// 1. Initialize backend
|
||||
ggml_backend_t backend = NULL;
|
||||
std::string backend_name = "";
|
||||
#ifdef GGML_USE_CUDA
|
||||
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||
backend = ggml_backend_cuda_init(0); // init device 0
|
||||
backend_name = "cuda";
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||
}
|
||||
#endif
|
||||
// #ifdef GGML_USE_CUDA
|
||||
// fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||
// backend = ggml_backend_cuda_init(0); // init device 0
|
||||
// backend_name = "cuda";
|
||||
// if (!backend) {
|
||||
// fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||
// }
|
||||
// #endif
|
||||
// if there aren't GPU Backends fallback to CPU backend
|
||||
if (!backend) {
|
||||
backend = ggml_backend_cpu_init();
|
||||
@@ -483,28 +486,82 @@ static void debug_test_mrope_2d() {
|
||||
ggml_backend_free(backend);
|
||||
}
|
||||
|
||||
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
|
||||
int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
|
||||
int ne = n_embd * 4;
|
||||
float vals[56 * 56 * 3];
|
||||
enum model_output_type {
|
||||
conv3d,
|
||||
patch_embed,
|
||||
patch_win_attn_scatter,
|
||||
first_attn_layer,
|
||||
last_attn_layer,
|
||||
attn_softmax,
|
||||
final_layer,
|
||||
};
|
||||
|
||||
static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
|
||||
constexpr int ih = 140;
|
||||
constexpr int iw = 196;
|
||||
// constexpr int ih = 56;
|
||||
// constexpr int iw = 56;
|
||||
// int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
|
||||
int n_embd = 1280;
|
||||
int merge = 1;
|
||||
if (output_type == model_output_type::final_layer) {
|
||||
n_embd = 2048;
|
||||
merge = 2;
|
||||
}
|
||||
else if (output_type == model_output_type::attn_softmax) {
|
||||
merge = 1;
|
||||
n_embd = (ih/14/merge) * (iw/14/merge) * 16;
|
||||
}
|
||||
|
||||
int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
|
||||
float vals[iw * ih * 3];
|
||||
// float embd[ne];
|
||||
std::vector<float> embd;
|
||||
embd.resize(ne);
|
||||
|
||||
for (int i = 0; i < 56*56; i++)
|
||||
for (int i = 0; i < iw*ih; i++)
|
||||
{
|
||||
for (int c = 0; c < 3; c++)
|
||||
vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
|
||||
vals[i * 3 + c] = (float)i / (iw*ih);
|
||||
}
|
||||
|
||||
clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
|
||||
clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
|
||||
|
||||
std::ofstream outFile("img_embed.bin", std::ios::binary);
|
||||
std::string file_postfix = "";
|
||||
switch (output_type)
|
||||
{
|
||||
case model_output_type::conv3d:
|
||||
file_postfix = "conv3d";
|
||||
break;
|
||||
case model_output_type::patch_embed:
|
||||
file_postfix = "patch_embed";
|
||||
break;
|
||||
case model_output_type::patch_win_attn_scatter:
|
||||
file_postfix = "scatter";
|
||||
break;
|
||||
case model_output_type::first_attn_layer:
|
||||
file_postfix = "first_attn";
|
||||
break;
|
||||
case model_output_type::last_attn_layer:
|
||||
file_postfix = "last_attn";
|
||||
break;
|
||||
case model_output_type::attn_softmax:
|
||||
file_postfix = "attn_softmax";
|
||||
break;
|
||||
case model_output_type::final_layer:
|
||||
file_postfix = "final";
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
auto output_path = "img_embed_" + file_postfix + ".bin";
|
||||
|
||||
std::ofstream outFile(output_path, std::ios::binary);
|
||||
if (outFile.is_open()) {
|
||||
outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
|
||||
|
||||
outFile.close();
|
||||
std::cout << "Data successfully written to mrope.bin" << std::endl;
|
||||
std::cout << "Data successfully written to ::[ " << output_path << std::endl;
|
||||
} else {
|
||||
std::cerr << "Error opening file!" << std::endl;
|
||||
}
|
||||
@@ -551,8 +608,9 @@ int main(int argc, char ** argv) {
|
||||
} else if (params.image[0].empty()) {
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
debug_test_mrope_2d();
|
||||
debug_dump_img_embed(ctx_llava);
|
||||
// debug_test_mrope_2d();
|
||||
debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
|
||||
// debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
ctx_llava->model = NULL;
|
||||
|
||||
@@ -55,6 +55,7 @@ add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # mode
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
|
||||
# to test the big models, run: ./tests.sh big
|
||||
add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
set(TARGET llama-quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -2,6 +2,9 @@
|
||||
const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}';
|
||||
|
||||
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
|
||||
if (maxItems == 0) {
|
||||
return '';
|
||||
}
|
||||
if (minItems === 0 && maxItems === 1) {
|
||||
return `${itemRule}?`;
|
||||
}
|
||||
|
||||
@@ -133,6 +133,11 @@ extern "C" {
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 1
|
||||
#define RPC_PROTO_MAJOR_VERSION 2
|
||||
#define RPC_PROTO_MINOR_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
+23
-3
@@ -393,8 +393,8 @@ extern "C" {
|
||||
|
||||
// precision
|
||||
enum ggml_prec {
|
||||
GGML_PREC_DEFAULT,
|
||||
GGML_PREC_F32,
|
||||
GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
|
||||
GGML_PREC_F32 = 10,
|
||||
};
|
||||
|
||||
// model file types
|
||||
@@ -481,6 +481,7 @@ extern "C" {
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_IM2COL,
|
||||
GGML_OP_IM2COL_BACK,
|
||||
GGML_OP_CONV_2D_DW,
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
@@ -677,6 +678,9 @@ extern "C" {
|
||||
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
|
||||
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
|
||||
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
@@ -1660,7 +1664,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// depthwise
|
||||
// depthwise (via im2col and mul_mat)
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@@ -1672,6 +1676,22 @@ extern "C" {
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
// Depthwise 2D convolution
|
||||
// may be faster than ggml_conv_2d_dw, but not available in all backends
|
||||
// a: KW KH 1 C convolution kernel
|
||||
// b: W H C N input data
|
||||
// res: W_out H_out C N
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int stride0,
|
||||
int stride1,
|
||||
int pad0,
|
||||
int pad1,
|
||||
int dilation0,
|
||||
int dilation1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_F16] = {
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.from_float = quantize_row_q8_K,
|
||||
},
|
||||
[GGML_TYPE_BF16] = {
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
@@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_im2col_back_f32(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_dw(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d(params, tensor);
|
||||
@@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
} break;
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
@@ -3161,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
#if defined(__AVX512F__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
__m512 x_vec = _mm512_loadu_ps(x + i);
|
||||
__m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm256_storeu_si256((__m256i *)(y + i), y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 x_vec = _mm256_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storeu_si128((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
for (; i + 3 < n; i += 4) {
|
||||
__m128 x_vec = _mm_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
#if defined(__AVX512F__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
__m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
|
||||
__m512 y_vec = _mm512_cvtph_ps(x_vec);
|
||||
_mm512_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
|
||||
__m256 y_vec = _mm256_cvtph_ps(x_vec);
|
||||
_mm256_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
for (; i + 3 < n; i += 4) {
|
||||
__m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
|
||||
__m128 y_vec = _mm_cvtph_ps(x_vec);
|
||||
_mm_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_BF16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX512F__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
_mm512_storeu_ps(y + i,
|
||||
_mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(
|
||||
_mm512_cvtepu16_epi32(
|
||||
_mm256_loadu_si256(
|
||||
(const __m256i *)(x + i))),
|
||||
16)));
|
||||
}
|
||||
#endif
|
||||
for (; i + 7 < n; i += 8) {
|
||||
_mm256_storeu_ps(y + i,
|
||||
_mm256_castsi256_ps(
|
||||
_mm256_slli_epi32(
|
||||
_mm256_cvtepu16_epi32(
|
||||
_mm_loadu_si128(
|
||||
(const __m128i *)(x + i))),
|
||||
16)));
|
||||
}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
y[i] = GGML_BF16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx(void) {
|
||||
#if defined(__AVX__)
|
||||
|
||||
+174
-2
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
|
||||
|
||||
GGML_ASSERT(i01 >= 0 && i01 < ne01);
|
||||
|
||||
ggml_fp16_to_fp32_row(
|
||||
ggml_cpu_fp16_to_fp32(
|
||||
(const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
}
|
||||
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
|
||||
|
||||
GGML_ASSERT(i01 >= 0 && i01 < ne01);
|
||||
|
||||
ggml_bf16_to_fp32_row(
|
||||
ggml_cpu_bf16_to_fp32(
|
||||
(const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
}
|
||||
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d_dw
|
||||
|
||||
struct ggml_conv_2d_dw_params {
|
||||
int64_t channels;
|
||||
int64_t batch;
|
||||
int64_t src_w;
|
||||
int64_t src_h;
|
||||
int64_t dst_w;
|
||||
int64_t dst_h;
|
||||
int64_t knl_w;
|
||||
int64_t knl_h;
|
||||
int stride_x;
|
||||
int stride_y;
|
||||
int pad_x;
|
||||
int pad_y;
|
||||
int dilation_x;
|
||||
int dilation_y;
|
||||
};
|
||||
|
||||
static void ggml_compute_forward_conv_2d_dw_cwhn(
|
||||
const ggml_compute_params * params,
|
||||
const ggml_tensor * src,
|
||||
const ggml_tensor * kernel,
|
||||
ggml_tensor * dst,
|
||||
const ggml_conv_2d_dw_params & p) {
|
||||
|
||||
const int64_t c = p.channels;
|
||||
const float * knl_data = (const float *)kernel->data;
|
||||
|
||||
const int64_t rows_total = p.dst_h * p.batch;
|
||||
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
|
||||
const int64_t row_start = params->ith * rows_per_thread;
|
||||
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
|
||||
|
||||
#ifdef GGML_SIMD
|
||||
const int64_t pkg_size = GGML_F32_EPR;
|
||||
const int64_t pkg_count = c / pkg_size;
|
||||
const int64_t c_pkg_end = pkg_count * pkg_size;
|
||||
#else
|
||||
const int64_t c_pkg_end = 0;
|
||||
#endif
|
||||
|
||||
for (int64_t row = row_start; row < row_end; ++row) {
|
||||
const int64_t dst_y = row % p.dst_h;
|
||||
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
|
||||
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
|
||||
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
|
||||
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
|
||||
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
|
||||
|
||||
#ifdef GGML_SIMD
|
||||
// Vectorized loop
|
||||
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
|
||||
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
|
||||
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
|
||||
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
|
||||
if (src_y < 0 || src_y >= p.src_h) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
|
||||
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
|
||||
if (src_x < 0 || src_x >= p.src_w) {
|
||||
continue;
|
||||
}
|
||||
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
|
||||
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
|
||||
sum = GGML_F32_VEC_FMA(sum, k, s);
|
||||
}
|
||||
}
|
||||
GGML_F32_VEC_STORE(dst_data + c_i, sum);
|
||||
}
|
||||
#endif
|
||||
// Scalar loop
|
||||
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
|
||||
float sum = 0.0f;
|
||||
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
|
||||
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
|
||||
if (src_y < 0 || src_y >= p.src_h) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
|
||||
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
|
||||
if (src_x < 0 || src_x >= p.src_w) {
|
||||
continue;
|
||||
}
|
||||
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
|
||||
* src_data[(src_y * p.src_w + src_x) * c + c_i];
|
||||
}
|
||||
}
|
||||
dst_data[c_i] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_dw_whcn(
|
||||
const ggml_compute_params * params,
|
||||
const ggml_tensor * src,
|
||||
const ggml_tensor * kernel,
|
||||
ggml_tensor * dst,
|
||||
const ggml_conv_2d_dw_params & p) {
|
||||
|
||||
const int64_t n = p.channels * p.batch;
|
||||
const int64_t per_thread = (n + params->nth - 1) / params->nth;
|
||||
const int64_t start = params->ith * per_thread;
|
||||
const int64_t end = MIN(start + per_thread, n);
|
||||
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
|
||||
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
|
||||
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
|
||||
|
||||
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
|
||||
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
|
||||
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
|
||||
if (src_y < 0 || src_y >= p.src_h) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
|
||||
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
|
||||
if (src_x < 0 || src_x >= p.src_w) {
|
||||
continue;
|
||||
}
|
||||
sum += knl_data[knl_y * p.knl_w + knl_x]
|
||||
* src_data[src_y * p.src_w + src_x];
|
||||
}
|
||||
}
|
||||
dst_data[dst_y * p.dst_w + dst_x] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_conv_2d_dw(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * src = dst->src[1];
|
||||
ggml_conv_2d_dw_params p;
|
||||
p.channels = src->ne[2];
|
||||
p.batch = src->ne[3];
|
||||
p.src_w = src->ne[0];
|
||||
p.src_h = src->ne[1];
|
||||
p.dst_w = dst->ne[0];
|
||||
p.dst_h = dst->ne[1];
|
||||
p.knl_w = kernel->ne[0];
|
||||
p.knl_h = kernel->ne[1];
|
||||
p.stride_x = dst->op_params[0];
|
||||
p.stride_y = dst->op_params[1];
|
||||
p.pad_x = dst->op_params[2];
|
||||
p.pad_y = dst->op_params[3];
|
||||
p.dilation_x = dst->op_params[4];
|
||||
p.dilation_y = dst->op_params[5];
|
||||
|
||||
GGML_ASSERT(kernel->ne[3] == p.channels);
|
||||
GGML_ASSERT(dst->ne[3] == p.batch);
|
||||
|
||||
if (ggml_is_contiguous(src)) {
|
||||
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
|
||||
} else if (ggml_is_contiguous_channels(src)) {
|
||||
// kernel should also have channels most contiguous in memory
|
||||
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
|
||||
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
|
||||
} else {
|
||||
GGML_ABORT("non-contiguous memory layout not supported");
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_pool_1d_sk_p0
|
||||
|
||||
static void ggml_compute_forward_pool_1d_sk_p0(
|
||||
|
||||
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
|
||||
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
@@ -78,13 +78,13 @@
|
||||
// Moore Threads
|
||||
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
|
||||
|
||||
#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
|
||||
#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
|
||||
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
|
||||
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
|
||||
|
||||
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
|
||||
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
|
||||
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
|
||||
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
|
||||
@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(disable_indirection_for_this_node);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
@@ -1935,8 +1935,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && use_mul_mat_vec_q) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
|
||||
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
// general KQ + KQV multi-batch without FlashAttention
|
||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||
} else if (use_mul_mat_vec) {
|
||||
|
||||
+42
-38
@@ -155,25 +155,27 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||
#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
|
||||
|
||||
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
|
||||
return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 :
|
||||
type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 :
|
||||
type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 :
|
||||
type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K :
|
||||
type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K :
|
||||
type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K :
|
||||
type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K :
|
||||
type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K :
|
||||
type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 :
|
||||
type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 :
|
||||
type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 :
|
||||
type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 :
|
||||
tile_x_sizes{0, 0, 0};
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
|
||||
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
|
||||
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1;
|
||||
case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
|
||||
case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
|
||||
case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
|
||||
case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K;
|
||||
case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K;
|
||||
case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16;
|
||||
case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16;
|
||||
case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
|
||||
default: return tile_x_sizes{0, 0, 0};
|
||||
}
|
||||
}
|
||||
|
||||
#define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4)
|
||||
@@ -189,25 +191,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
|
||||
|
||||
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
|
||||
type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
|
||||
type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K :
|
||||
type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K :
|
||||
type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 :
|
||||
type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 :
|
||||
type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K :
|
||||
type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K :
|
||||
type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K :
|
||||
type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
type == GGML_TYPE_IQ4_NL ? MMQ_MMA_TILE_X_K_Q8_0 :
|
||||
0;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
|
||||
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
|
||||
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K;
|
||||
case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K;
|
||||
case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K;
|
||||
case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
|
||||
|
||||
+42
-38
@@ -7,47 +7,51 @@
|
||||
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
|
||||
|
||||
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
|
||||
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
|
||||
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
|
||||
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
|
||||
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
|
||||
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
|
||||
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
|
||||
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
|
||||
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
|
||||
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
|
||||
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
|
||||
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
|
||||
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
|
||||
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
|
||||
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
|
||||
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
|
||||
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
|
||||
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
|
||||
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
|
||||
nullptr;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
|
||||
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
|
||||
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
|
||||
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
|
||||
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
|
||||
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
|
||||
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
|
||||
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
|
||||
case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1;
|
||||
case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1;
|
||||
case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
|
||||
case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1;
|
||||
case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1;
|
||||
case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
|
||||
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
|
||||
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
|
||||
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
|
||||
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
|
||||
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
|
||||
default: return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
|
||||
1;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
||||
enum mmvq_parameter_table_id {
|
||||
|
||||
@@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
|
||||
}
|
||||
|
||||
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
||||
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
||||
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
||||
// No response
|
||||
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
|
||||
uint8_t cmd_byte = cmd;
|
||||
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
||||
return false;
|
||||
@@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
||||
if (!send_data(sock->fd, input, input_size)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
||||
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
||||
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
||||
if (!send_rpc_cmd(sock, cmd, input, input_size)) {
|
||||
return false;
|
||||
}
|
||||
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
||||
// even if we do, we can skip sending output_size from the server for commands with known output size
|
||||
uint64_t out_size;
|
||||
@@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
||||
GGML_ASSERT(status);
|
||||
}
|
||||
|
||||
@@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
if (!server.set_tensor(input)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, nullptr, 0)) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_SET_TENSOR_HASH: {
|
||||
|
||||
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
|
||||
int device;
|
||||
std::string name;
|
||||
optimize_feature opt_feature;
|
||||
bool optimized_graph=false;
|
||||
|
||||
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
|
||||
|
||||
@@ -494,5 +493,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
|
||||
|
||||
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
|
||||
|
||||
constexpr size_t ceil_div(const size_t m, const size_t n) {
|
||||
return (m + n - 1) / n;
|
||||
}
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -21,6 +21,27 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
||||
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
||||
dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
||||
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
||||
dst[i] = sycl::fabs(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
||||
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
||||
dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void gelu(const T * x, T * dst, const int k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
@@ -335,6 +356,37 @@ static void silu_sycl(const T *x, T *dst, const int k,
|
||||
});
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
||||
// hard code for now
|
||||
const int num_blocks = ceil_div(k, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
||||
sgn(x, dst, k, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
||||
// hard code for now
|
||||
const int num_blocks = ceil_div(k, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
||||
abs_op(x, dst, k, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
||||
// hard code for now
|
||||
const int num_blocks = ceil_div(k, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
||||
elu_op(x, dst, k, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void gelu_quick_sycl(const T *x, T *dst, const int k,
|
||||
queue_ptr stream) {
|
||||
@@ -574,6 +626,106 @@ static void clamp_sycl(const T *x, T *dst, const float min,
|
||||
});
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
|
||||
#else
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
#endif
|
||||
GGML_ASSERT(dst->src[0]->type == dst->type);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
switch (dst->type) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
auto data_pts = cast_data<sycl::half>(dst);
|
||||
sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
auto data_pts = cast_data<float>(dst);
|
||||
sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("GGML tensor type not supported!\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
|
||||
#else
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
#endif
|
||||
GGML_ASSERT(dst->src[0]->type == dst->type);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
switch (dst->type) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
auto data_pts = cast_data<sycl::half>(dst);
|
||||
abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
auto data_pts = cast_data<float>(dst);
|
||||
abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("GGML tensor type not supported!\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
|
||||
#else
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
#endif
|
||||
GGML_ASSERT(dst->src[0]->type == dst->type);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
switch (dst->type) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
auto data_pts = cast_data<sycl::half>(dst);
|
||||
elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
auto data_pts = cast_data<float>(dst);
|
||||
elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("GGML tensor type not supported!\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
@@ -1388,3 +1540,20 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
ggml_sycl_op_sgn(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
ggml_sycl_op_abs(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
ggml_sycl_op_elu(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
@@ -66,5 +66,10 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
#endif // GGML_SYCL_ELEMENTWISE_HPP
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
#include "ggml-sycl/common.hpp"
|
||||
#include "ggml-sycl/element_wise.hpp"
|
||||
#include "ggml-sycl/presets.hpp"
|
||||
#include "ggml-sycl/gemm.hpp"
|
||||
#include "ggml-sycl/sycl_hw.hpp"
|
||||
@@ -192,7 +193,7 @@ static void ggml_check_sycl() try {
|
||||
|
||||
if (!initialized) {
|
||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
|
||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
@@ -2852,6 +2853,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||
}
|
||||
}
|
||||
|
||||
static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
||||
size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
|
||||
.wait()));
|
||||
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
|
||||
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
|
||||
int offset_blks = offset / sizeof(block_q4_0);
|
||||
auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
|
||||
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
|
||||
|
||||
stream->parallel_for(
|
||||
size / sizeof(block_q4_0),
|
||||
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
||||
const int ib = i;
|
||||
|
||||
for (int j = 0; j < QK4_0/2; j ++)
|
||||
{
|
||||
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
|
||||
}
|
||||
*(d_ptr + ib) = x[ib].d;
|
||||
});
|
||||
|
||||
sycl::free(tmp_buf, *stream);
|
||||
}
|
||||
|
||||
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
char*data_device = (char*)src0->data;
|
||||
size_t ncols = src0->ne[0];
|
||||
size_t nrows = src0->ne[1];
|
||||
size_t size = ggml_nbytes(src0);
|
||||
|
||||
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function could be called when the OP (mul_mat) function support reorder optimizition.
|
||||
*/
|
||||
static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
|
||||
ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
|
||||
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
|
||||
src0->type == GGML_TYPE_Q4_0 &&
|
||||
src1->ne[2]==1 && src1->ne[3]==1) {
|
||||
|
||||
ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
|
||||
if (!extra) return; //only happen in CI/UT permute case.
|
||||
|
||||
if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder.
|
||||
|
||||
reorder_qw(src0, ctx->stream());
|
||||
extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
||||
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
||||
@@ -2914,6 +2973,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
// KQ + KQV multi-batch
|
||||
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
||||
} else if (use_dequantize_mul_mat_vec) {
|
||||
opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
|
||||
// save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
|
||||
} else if (use_mul_mat_vec_q) {
|
||||
@@ -2921,6 +2981,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
} else if (use_mul_mat_q) {
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
|
||||
} else {
|
||||
opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
|
||||
}
|
||||
}
|
||||
@@ -3295,6 +3356,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_EXP:
|
||||
ggml_sycl_exp(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
ggml_sycl_sgn(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_ABS:
|
||||
ggml_sycl_abs(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
ggml_sycl_elu(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3545,71 +3615,8 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
||||
size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
|
||||
.wait()));
|
||||
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
|
||||
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
|
||||
int offset_blks = offset / sizeof(block_q4_0);
|
||||
auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
|
||||
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
|
||||
|
||||
stream->parallel_for(
|
||||
size / sizeof(block_q4_0),
|
||||
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
||||
const int ib = i;
|
||||
|
||||
for (int j = 0; j < QK4_0/2; j ++)
|
||||
{
|
||||
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
|
||||
}
|
||||
*(d_ptr + ib) = x[ib].d;
|
||||
});
|
||||
|
||||
sycl::free(tmp_buf, *stream);
|
||||
}
|
||||
|
||||
static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
char*data_device = (char*)src0->data;
|
||||
size_t ncols = src0->ne[0];
|
||||
size_t nrows = src0->ne[1];
|
||||
size_t size = ggml_nbytes(src0);
|
||||
|
||||
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
||||
}
|
||||
|
||||
static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
||||
ggml_tensor *src0 = dst->src[0];
|
||||
ggml_tensor *src1 = dst->src[1];
|
||||
|
||||
if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
|
||||
src1->ne[2]==1 && src1->ne[3]==1) {
|
||||
reorder_qw(src0, stream);
|
||||
ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
|
||||
GGML_ASSERT(extra);
|
||||
extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
|
||||
}
|
||||
}
|
||||
|
||||
static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
|
||||
dpct::queue_ptr stream = ctx->stream();
|
||||
if (ctx->optimized_graph) {
|
||||
return;
|
||||
}
|
||||
ctx->optimized_graph = true;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -3840,6 +3847,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_ABS:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
#if defined (GGML_SYCL_F16)
|
||||
return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
|
||||
#else
|
||||
|
||||
+56
-48
@@ -4,6 +4,7 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml.h"
|
||||
|
||||
// FIXME: required here for quantization functions
|
||||
@@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
|
||||
// currently, the ggml_cpu_has_* functions are entirely compile-time
|
||||
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
//if (ggml_cpu_has_f16c()) {
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 x_vec = _mm256_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storeu_si128((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
for(; i + 3 < n; i += 4) {
|
||||
__m128 x_vec = _mm_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
int i = 0;
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__AVX512F__)
|
||||
//if (ggml_cpu_has_avx512()) {
|
||||
for (; i + 16 <= n; i += 16) {
|
||||
_mm512_storeu_ps(y + i,
|
||||
_mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(
|
||||
_mm512_cvtepu16_epi32(
|
||||
_mm256_loadu_si256(
|
||||
(const __m256i *)(x + i))),
|
||||
16)));
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
//if (ggml_cpu_has_avx2()) {
|
||||
for (; i + 8 <= n; i += 8) {
|
||||
_mm256_storeu_ps(y + i,
|
||||
_mm256_castsi256_ps(
|
||||
_mm256_slli_epi32(
|
||||
_mm256_cvtepu16_epi32(
|
||||
_mm_loadu_si128(
|
||||
(const __m128i *)(x + i))),
|
||||
16)));
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
int i = 0;
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_BF16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
@@ -956,6 +915,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CONV_TRANSPOSE_1D",
|
||||
"IM2COL",
|
||||
"IM2COL_BACK",
|
||||
"CONV_2D_DW",
|
||||
"CONV_TRANSPOSE_2D",
|
||||
"POOL_1D",
|
||||
"POOL_2D",
|
||||
@@ -993,7 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"OPT_STEP_ADAMW",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
|
||||
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1050,6 +1010,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"conv_transpose_1d(x)",
|
||||
"im2col(x)",
|
||||
"im2col_back(x)",
|
||||
"conv_2d_dw(x)",
|
||||
"conv_transpose_2d(x)",
|
||||
"pool_1d(x)",
|
||||
"pool_2d(x)",
|
||||
@@ -1087,7 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"adamw(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
|
||||
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -1344,6 +1305,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
||||
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
||||
}
|
||||
|
||||
bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
|
||||
return
|
||||
tensor->nb[0] > tensor->nb[2] &&
|
||||
tensor->nb[1] > tensor->nb[0] &&
|
||||
tensor->nb[2] == ggml_type_size(tensor->type);
|
||||
}
|
||||
|
||||
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
@@ -4050,6 +4018,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_2d_dw_direct
|
||||
|
||||
struct ggml_tensor * ggml_conv_2d_dw_direct(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int stride0,
|
||||
int stride1,
|
||||
int pad0,
|
||||
int pad1,
|
||||
int dilation0,
|
||||
int dilation1) {
|
||||
GGML_ASSERT(a->ne[2] == 1);
|
||||
GGML_ASSERT(a->ne[3] == b->ne[2]);
|
||||
int64_t ne[4];
|
||||
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
|
||||
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
|
||||
ne[2] = b->ne[2];
|
||||
ne[3] = b->ne[3];
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
|
||||
|
||||
if (ggml_is_contiguous_channels(b)) {
|
||||
// Result will be permuted the same way as input (CWHN order)
|
||||
const int64_t type_size = ggml_type_size(result->type);
|
||||
GGML_ASSERT(ggml_blck_size(result->type) == 1);
|
||||
result->nb[0] = result->ne[2] * type_size;
|
||||
result->nb[1] = result->ne[0] * result->nb[0];
|
||||
result->nb[2] = type_size;
|
||||
}
|
||||
|
||||
int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_CONV_2D_DW;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_transpose_2d_p0
|
||||
|
||||
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
|
||||
|
||||
+1
-1
@@ -112,7 +112,7 @@ You can use GBNF grammars:
|
||||
|
||||
- In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field
|
||||
- In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags
|
||||
- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
|
||||
- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings.
|
||||
|
||||
## JSON Schemas → GBNF
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
f71d538ece3fb32a04824dc6d1e73e360be9d22f
|
||||
13bcf9ce50651a8b4238ec6d136f46f2c1b23b6f
|
||||
|
||||
+3
-2
@@ -32,8 +32,9 @@ add_library(llama
|
||||
unicode.h
|
||||
)
|
||||
|
||||
target_include_directories(llama PUBLIC . ../include)
|
||||
target_compile_features (llama PUBLIC cxx_std_17) # don't bump
|
||||
target_include_directories(llama PRIVATE .)
|
||||
target_include_directories(llama PUBLIC ../include)
|
||||
target_compile_features (llama PRIVATE cxx_std_17) # don't bump
|
||||
|
||||
target_link_libraries(llama PUBLIC ggml)
|
||||
|
||||
|
||||
+7
-7
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
||||
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
|
||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
|
||||
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
||||
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
@@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
}
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||
} else if (tmpl_contains("[gMASK]<sop>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATGLM_4;
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
|
||||
@@ -154,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
||||
} else if (tmpl_contains("[gMASK]sop")) {
|
||||
// chatglm3-6b
|
||||
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
||||
} else if (tmpl_contains("[gMASK]<sop>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
||||
return LLM_CHAT_TEMPLATE_CHATGLM_3;
|
||||
} else if (tmpl_contains(LU8("<用户>"))) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
return LLM_CHAT_TEMPLATE_MINICPM;
|
||||
@@ -437,7 +437,7 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
|
||||
// chatglm3-6b
|
||||
ss << "[gMASK]" << "sop";
|
||||
for (auto message : chat) {
|
||||
@@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
|
||||
ss << "[gMASK]" << "<sop>";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
|
||||
+2
-2
@@ -29,8 +29,8 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
||||
LLM_CHAT_TEMPLATE_COMMAND_R,
|
||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_4,
|
||||
LLM_CHAT_TEMPLATE_CHATGLM_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGLM_4,
|
||||
LLM_CHAT_TEMPLATE_GLMEDGE,
|
||||
LLM_CHAT_TEMPLATE_MINICPM,
|
||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||
|
||||
+3
-14
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
ggml_backend_buffer * bbuf) const {
|
||||
float freq_scale) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
|
||||
// dequantize to f32 -> RoPE -> quantize back
|
||||
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
|
||||
|
||||
if (bbuf) {
|
||||
for (const auto & backend : backends) {
|
||||
// Figure out which backend KV cache belongs to
|
||||
if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
|
||||
ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tmp = ggml_rope_ext_inplace(ctx0, tmp,
|
||||
tmp = ggml_rope_ext(ctx0, tmp,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
|
||||
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
0);
|
||||
|
||||
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
|
||||
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
+1
-2
@@ -170,8 +170,7 @@ private:
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
ggml_backend_buffer * bbuf) const;
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
|
||||
@@ -803,6 +803,10 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
|
||||
if (down) {
|
||||
cur = build_lora_mm(down, cur);
|
||||
if (arch == LLM_ARCH_GLM4) {
|
||||
// GLM4 seems to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
|
||||
if (down_b) {
|
||||
|
||||
@@ -10149,6 +10149,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||
|
||||
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
|
||||
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
|
||||
ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
|
||||
cb(q_nope_absorbed, "q_nope_absorbed", il);
|
||||
|
||||
// {kv_lora_rank, n_head, n_tokens}
|
||||
|
||||
+39
-30
@@ -1,5 +1,17 @@
|
||||
llama_add_compile_flags()
|
||||
|
||||
function(llama_build source)
|
||||
if (DEFINED LLAMA_TEST_NAME)
|
||||
set(TEST_TARGET ${LLAMA_TEST_NAME})
|
||||
else()
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
endif()
|
||||
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
endfunction()
|
||||
|
||||
function(llama_test target)
|
||||
include(CMakeParseArguments)
|
||||
set(options)
|
||||
@@ -36,7 +48,7 @@ endfunction()
|
||||
# - LABEL: label for the test (defaults to main)
|
||||
# - ARGS: arguments to pass to the test executable
|
||||
# - WORKING_DIRECTORY
|
||||
function(llama_target_and_test source)
|
||||
function(llama_build_and_test source)
|
||||
include(CMakeParseArguments)
|
||||
set(options)
|
||||
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
|
||||
@@ -58,6 +70,7 @@ function(llama_target_and_test source)
|
||||
add_executable(${TEST_TARGET} ${source} get-model.cpp)
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||
|
||||
add_test(
|
||||
NAME ${TEST_TARGET}
|
||||
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
|
||||
@@ -68,9 +81,7 @@ function(llama_target_and_test source)
|
||||
endfunction()
|
||||
|
||||
# build test-tokenizer-0 target once and add many tests
|
||||
add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
|
||||
target_link_libraries(test-tokenizer-0 PRIVATE common)
|
||||
install(TARGETS test-tokenizer-0 RUNTIME)
|
||||
llama_build(test-tokenizer-0.cpp)
|
||||
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
|
||||
@@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
endif ()
|
||||
|
||||
if (NOT WIN32)
|
||||
# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
|
||||
llama_target_and_test(test-sampling.cpp)
|
||||
llama_target_and_test(test-grammar-parser.cpp)
|
||||
llama_target_and_test(test-grammar-integration.cpp)
|
||||
llama_target_and_test(test-llama-grammar.cpp)
|
||||
llama_target_and_test(test-chat.cpp)
|
||||
llama_build_and_test(test-sampling.cpp)
|
||||
llama_build_and_test(test-grammar-parser.cpp)
|
||||
llama_build_and_test(test-grammar-integration.cpp)
|
||||
llama_build_and_test(test-llama-grammar.cpp)
|
||||
llama_build_and_test(test-chat.cpp)
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
|
||||
endif()
|
||||
|
||||
llama_build(test-quantize-stats.cpp)
|
||||
llama_build(test-gbnf-validator.cpp)
|
||||
|
||||
# build test-tokenizer-1-bpe target once and add many tests
|
||||
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
|
||||
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
|
||||
install(TARGETS test-tokenizer-1-bpe RUNTIME)
|
||||
llama_build(test-tokenizer-1-bpe.cpp)
|
||||
|
||||
# TODO: disabled due to slowness
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
@@ -120,37 +131,35 @@ if (NOT WIN32)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
# build test-tokenizer-1-spm target once and add many tests
|
||||
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
|
||||
target_link_libraries(test-tokenizer-1-spm PRIVATE common)
|
||||
install(TARGETS test-tokenizer-1-spm RUNTIME)
|
||||
llama_build(test-tokenizer-1-spm.cpp)
|
||||
|
||||
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||
|
||||
# llama_target_and_test(test-double-float.cpp) # SLOW
|
||||
# llama_build_and_test(test-double-float.cpp) # SLOW
|
||||
endif()
|
||||
|
||||
llama_target_and_test(test-log.cpp)
|
||||
llama_target_and_test(test-chat-template.cpp)
|
||||
llama_build_and_test(test-log.cpp)
|
||||
llama_build_and_test(test-chat-template.cpp)
|
||||
|
||||
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
|
||||
if (NOT WIN32)
|
||||
llama_target_and_test(test-arg-parser.cpp)
|
||||
llama_build_and_test(test-arg-parser.cpp)
|
||||
endif()
|
||||
|
||||
# llama_target_and_test(test-opt.cpp) # SLOW
|
||||
llama_target_and_test(test-gguf.cpp)
|
||||
llama_target_and_test(test-backend-ops.cpp)
|
||||
# llama_build_and_test(test-opt.cpp) # SLOW
|
||||
llama_build_and_test(test-gguf.cpp)
|
||||
llama_build_and_test(test-backend-ops.cpp)
|
||||
|
||||
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
llama_target_and_test(test-autorelease.cpp LABEL "model")
|
||||
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
llama_build_and_test(test-autorelease.cpp LABEL "model")
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these tests use the backends directly and cannot be built with dynamic loading
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
llama_build_and_test(test-barrier.cpp)
|
||||
llama_build_and_test(test-quantize-fns.cpp)
|
||||
llama_build_and_test(test-quantize-perf.cpp)
|
||||
llama_build_and_test(test-rope.cpp)
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
@@ -126,6 +126,53 @@ int main(void) {
|
||||
assert(params.cpuparams.n_threads == 1010);
|
||||
#endif // _WIN32
|
||||
|
||||
if (common_has_curl()) {
|
||||
printf("test-arg-parser: test curl-related functions\n\n");
|
||||
const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md";
|
||||
const char * BAD_URL = "https://www.google.com/404";
|
||||
const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";
|
||||
|
||||
{
|
||||
printf("test-arg-parser: test good URL\n\n");
|
||||
auto res = common_remote_get_content(GOOD_URL, {});
|
||||
assert(res.first == 200);
|
||||
assert(res.second.size() > 0);
|
||||
std::string str(res.second.data(), res.second.size());
|
||||
assert(str.find("llama.cpp") != std::string::npos);
|
||||
}
|
||||
|
||||
{
|
||||
printf("test-arg-parser: test bad URL\n\n");
|
||||
auto res = common_remote_get_content(BAD_URL, {});
|
||||
assert(res.first == 404);
|
||||
}
|
||||
|
||||
{
|
||||
printf("test-arg-parser: test max size error\n");
|
||||
common_remote_params params;
|
||||
params.max_size = 1;
|
||||
try {
|
||||
common_remote_get_content(GOOD_URL, params);
|
||||
assert(false && "it should throw an error");
|
||||
} catch (std::exception & e) {
|
||||
printf(" expected error: %s\n\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
printf("test-arg-parser: test timeout error\n");
|
||||
common_remote_params params;
|
||||
params.timeout = 1;
|
||||
try {
|
||||
common_remote_get_content(BIG_FILE, params);
|
||||
assert(false && "it should throw an error");
|
||||
} catch (std::exception & e) {
|
||||
printf(" expected error: %s\n\n", e.what());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
printf("test-arg-parser: no curl, skipping curl-related functions\n");
|
||||
}
|
||||
|
||||
printf("test-arg-parser: all tests OK\n\n");
|
||||
}
|
||||
|
||||
@@ -2606,6 +2606,8 @@ struct test_rope : public test_case {
|
||||
} else {
|
||||
out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
|
||||
}
|
||||
|
||||
// TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp
|
||||
}
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
|
||||
+3
-2
@@ -11,8 +11,9 @@
|
||||
#include <string>
|
||||
|
||||
#include "chat.h"
|
||||
#include "llama-grammar.h"
|
||||
#include "unicode.h"
|
||||
|
||||
#include "../src/unicode.h"
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "unicode.h"
|
||||
#include "llama-grammar.h"
|
||||
#include "../src/unicode.h"
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
@@ -2,10 +2,11 @@
|
||||
#undef NDEBUG
|
||||
#endif
|
||||
|
||||
#include "unicode.h"
|
||||
#include "llama-grammar.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
||||
#include "../src/unicode.h"
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# undef NDEBUG
|
||||
#endif
|
||||
|
||||
#include "unicode.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -84,7 +83,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
|
||||
|
||||
fprintf(stderr,
|
||||
"\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following "
|
||||
"command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf "
|
||||
"command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf "
|
||||
"test-grammar-integration.string.txt\n\n");
|
||||
} else {
|
||||
fprintf(stdout, "✅︎\n");
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
#endif
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-grammar.h"
|
||||
|
||||
// TODO: shold not include libllama sources
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
||||
#include "llama-grammar.h"
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
@@ -597,6 +597,22 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
)"""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"maxItems 0",
|
||||
R"""({
|
||||
"items": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"maxItems": 0
|
||||
})""",
|
||||
R"""(
|
||||
boolean ::= ("true" | "false") space
|
||||
root ::= "[" space "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"maxItems 1",
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
#endif
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-grammar.h"
|
||||
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "llama-model.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "../src/llama-model.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
@@ -1,8 +1,9 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "unicode.h"
|
||||
#include "console.h"
|
||||
|
||||
#include "../src/unicode.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <codecvt>
|
||||
#include <cstdio>
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "unicode.h"
|
||||
#include "console.h"
|
||||
|
||||
#include "../src/unicode.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <codecvt>
|
||||
#include <cstdio>
|
||||
|
||||
Reference in New Issue
Block a user