mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 01:57:43 +02:00
Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1d735c0b4f | |||
| 5368ddda7a | |||
| 84a9bf2fc2 | |||
| 2016f07bd1 | |||
| 6602304814 | |||
| 66168204be | |||
| 4ba9d711ba | |||
| 00137157fc | |||
| fb28f4f80e | |||
| 37b9f0d29d | |||
| 6408210082 | |||
| aff9d107b0 | |||
| 35370ba945 | |||
| 8d66005763 | |||
| b9154ecff9 | |||
| 2db9ba1464 | |||
| 2f74c354c0 |
@@ -601,8 +601,9 @@ jobs:
|
||||
-DGGML_SYCL_F16=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
build-linux-cross:
|
||||
uses: ./.github/workflows/build-linux-cross.yml
|
||||
# Disabled for now due to sporadic issue syncing.
|
||||
# build-linux-cross:
|
||||
# uses: ./.github/workflows/build-linux-cross.yml
|
||||
|
||||
macOS-latest-cmake-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
@@ -16,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
|
||||
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
|
||||
+1
-1
@@ -2726,7 +2726,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.chat_template = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||
add_opt(common_arg(
|
||||
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
|
||||
string_format(
|
||||
|
||||
+379
-287
File diff suppressed because it is too large
Load Diff
@@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
# reuse model definitions from convert_hf_to_gguf.py
|
||||
from convert_hf_to_gguf import LazyTorchTensor, Model
|
||||
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
@@ -340,11 +340,11 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
hparams = ModelBase.load_hparams(dir_base_model)
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -61,19 +61,9 @@ if(TARGET BUILD_INFO)
|
||||
add_dependencies(mtmd BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET llama-llava-cli)
|
||||
add_executable(${TARGET} llava-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-minicpmv-cli)
|
||||
add_executable(${TARGET} minicpmv-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-qwen2vl-cli)
|
||||
add_executable(${TARGET} qwen2vl-cli.cpp)
|
||||
@@ -82,9 +72,9 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-gemma3-cli)
|
||||
add_executable(${TARGET} gemma3-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable(${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -50,7 +50,6 @@
|
||||
// tensor name constants
|
||||
//
|
||||
|
||||
#define TN_TOKEN_EMBD "%s.token_embd.weight"
|
||||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
||||
@@ -66,8 +65,6 @@
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
|
||||
+464
-458
File diff suppressed because it is too large
Load Diff
@@ -30,12 +30,13 @@ struct clip_image_size {
|
||||
int height;
|
||||
};
|
||||
|
||||
struct clip_image_f32;
|
||||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
ggml_log_level verbosity;
|
||||
enum ggml_log_level verbosity;
|
||||
};
|
||||
|
||||
// deprecated, use clip_init
|
||||
@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
|
||||
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
|
||||
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
||||
CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
|
||||
/**
|
||||
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::string filename = "main";
|
||||
if (argc >= 1) {
|
||||
filename = argv[0];
|
||||
}
|
||||
|
||||
// Get only the program name from the full path
|
||||
size_t pos = filename.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
filename = filename.substr(pos+1);
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
|
||||
fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
@@ -1,332 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "base64.hpp"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
||||
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * sample(struct common_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx_llama);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
static std::string ret;
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = common_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
||||
static const char* IMG_BASE64_TAG_END = "\">";
|
||||
|
||||
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
|
||||
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
||||
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
||||
}
|
||||
|
||||
static bool prompt_contains_image(const std::string& prompt) {
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
return (begin != std::string::npos);
|
||||
}
|
||||
|
||||
// replaces the base64 image tag in the prompt with `replacement`
|
||||
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
|
||||
size_t img_base64_str_start, img_base64_str_end;
|
||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
|
||||
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
|
||||
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
|
||||
|
||||
auto required_bytes = base64::required_encode_size(base64_str.size());
|
||||
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
||||
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
||||
|
||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||
if (!embed) {
|
||||
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return embed;
|
||||
}
|
||||
|
||||
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
if (begin == std::string::npos || end == std::string::npos) {
|
||||
return prompt;
|
||||
}
|
||||
auto pre = prompt.substr(0, begin);
|
||||
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
|
||||
return pre + replacement + post;
|
||||
}
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG("\n example usage:\n");
|
||||
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
|
||||
|
||||
// load and preprocess the image
|
||||
llava_image_embed * embed = NULL;
|
||||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
LOG_INF("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||
if (!embed) {
|
||||
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
} else {
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return embed;
|
||||
}
|
||||
|
||||
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
|
||||
int n_past = 0;
|
||||
|
||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||
|
||||
std::string system_prompt, user_prompt;
|
||||
size_t image_pos = prompt.find("<image>");
|
||||
if (image_pos != std::string::npos) {
|
||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// llava-1.5 native mode
|
||||
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
|
||||
user_prompt = prompt + "\nASSISTANT:";
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
if (!smpl) {
|
||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
LOG("%s", tmp);
|
||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
common_sampler_free(smpl);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(common_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||
|
||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_model_free(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto * model = llava_init(¶ms);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (prompt_contains_image(params.prompt)) {
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
} else {
|
||||
for (auto & image : params.image) {
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
if (!image_embed) {
|
||||
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
}
|
||||
|
||||
llama_model_free(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,354 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <iostream> // TODO: remove me
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(common_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
if (params->n_ctx < 2048) {
|
||||
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
||||
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||
ctx_params.n_ctx = 2048;
|
||||
} else {
|
||||
ctx_params.n_ctx = params->n_ctx;
|
||||
}
|
||||
|
||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_model_free(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
||||
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
}
|
||||
|
||||
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
|
||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
slice_embed->embed = image_embed;
|
||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
llava_image_embed_free(slice_embed);
|
||||
}
|
||||
|
||||
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
|
||||
std::string system_prompt;
|
||||
int idx = 0;
|
||||
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (num_image_embeds > 1) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct common_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx_llama);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
static std::string ret;
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = common_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
|
||||
auto * ctx_clip = clip_init_context(params);
|
||||
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
if (params->prompt.empty() && params->interactive == false) {
|
||||
LOG_ERR("prompt should be given or interactive mode should be on");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * model = llava_init(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
auto * ctx_llava = llava_init_context(params, model);
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
const int64_t t_process_image_start_us = ggml_time_us();
|
||||
process_image(ctx_llava, embeds, params, n_past);
|
||||
const int64_t t_process_image_end_us = ggml_time_us();
|
||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
llava_image_embed_free(embeds);
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (!is_first) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG_INF("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
return smpl;
|
||||
}
|
||||
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
|
||||
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (auto & image : params.image) {
|
||||
int n_past = 0;
|
||||
auto * ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
LOG("<user>%s\n", params.prompt.c_str());
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response;
|
||||
bool have_tmp = false;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0){
|
||||
if (!have_tmp) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
have_tmp = true;
|
||||
printf("%s", tmp);
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
common_sampler_free(smpl);
|
||||
}else {
|
||||
while (true) {
|
||||
LOG("<user>");
|
||||
std::string prompt;
|
||||
std::getline(std::cin, prompt);
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
printf("%s", tmp);// mistral llava-1.6
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
fflush(stdout);
|
||||
}
|
||||
common_sampler_free(smpl);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -28,15 +28,16 @@ static bool g_is_generating = false;
|
||||
|
||||
/**
|
||||
* Please note that this is NOT a production-ready stuff.
|
||||
* It is a playground for trying Gemma 3 vision capabilities.
|
||||
* It is a playground for trying multimodal support in llama.cpp.
|
||||
* For contributors: please keep this code simple and easy to understand.
|
||||
*/
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG(
|
||||
"Experimental CLI for using Gemma 3 vision model\n\n"
|
||||
"Experimental CLI for multimodal\n\n"
|
||||
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
|
||||
" -m and --mmproj are required\n"
|
||||
" -hf user/repo can replace both -m and --mmproj in most cases\n"
|
||||
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
|
||||
argv[0]
|
||||
);
|
||||
@@ -56,7 +57,7 @@ static void sigint_handler(int signo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gemma3_context {
|
||||
struct mtmd_cli_context {
|
||||
mtmd_context_ptr ctx_vision;
|
||||
common_init_result llama_init;
|
||||
|
||||
@@ -70,18 +71,38 @@ struct gemma3_context {
|
||||
// so here we don't need to keep track of chat history
|
||||
common_chat_templates_ptr tmpls;
|
||||
|
||||
// support for legacy templates (models not having EOT token)
|
||||
llama_tokens antiprompt_tokens;
|
||||
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
model = llama_init.model.get();
|
||||
lctx = llama_init.context.get();
|
||||
vocab = llama_model_get_vocab(model);
|
||||
n_threads = params.cpuparams.n_threads;
|
||||
batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
n_batch = params.n_batch;
|
||||
|
||||
if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
|
||||
LOG_ERR("Model does not have chat template.\n");
|
||||
LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
|
||||
LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
tmpls = common_chat_templates_init(model, params.chat_template);
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str());
|
||||
|
||||
init_vision_context(params);
|
||||
|
||||
// load antiprompt tokens for legacy templates
|
||||
if (params.chat_template == "vicuna") {
|
||||
antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
|
||||
} else if (params.chat_template == "deepseek") {
|
||||
antiprompt_tokens = common_tokenize(lctx, "###", false, true);
|
||||
}
|
||||
}
|
||||
|
||||
void init_vision_context(common_params & params) {
|
||||
@@ -97,6 +118,17 @@ struct gemma3_context {
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
bool check_antiprompt(const llama_tokens & generated_tokens) {
|
||||
if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(
|
||||
generated_tokens.end() - antiprompt_tokens.size(),
|
||||
generated_tokens.end(),
|
||||
antiprompt_tokens.begin()
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
struct decode_embd_batch {
|
||||
@@ -132,7 +164,8 @@ struct decode_embd_batch {
|
||||
}
|
||||
};
|
||||
|
||||
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
llama_tokens generated_tokens;
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
if (i > n_predict || !g_is_generating) {
|
||||
printf("\n");
|
||||
@@ -140,9 +173,10 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
|
||||
}
|
||||
|
||||
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
|
||||
generated_tokens.push_back(token_id);
|
||||
common_sampler_accept(smpl, token_id, true);
|
||||
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
|
||||
printf("\n");
|
||||
break; // end of generation
|
||||
}
|
||||
@@ -161,7 +195,7 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
|
||||
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
|
||||
std::vector<mtmd_bitmap> bitmaps;
|
||||
|
||||
common_chat_templates_inputs tmpl_inputs;
|
||||
@@ -184,18 +218,19 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
|
||||
text.text = formatted_chat.prompt;
|
||||
text.add_special = add_bos;
|
||||
text.parse_special = true;
|
||||
mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps));
|
||||
if (chunks == nullptr) {
|
||||
LOG_ERR("Unable to tokenize prompt\n");
|
||||
mtmd_input_chunks chunks;
|
||||
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
|
||||
if (res != 0) {
|
||||
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) {
|
||||
if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
|
||||
LOG_ERR("Unable to eval prompt\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx.n_past += mtmd_helper_get_n_tokens(chunks.get());
|
||||
ctx.n_past += mtmd_helper_get_n_tokens(chunks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -217,7 +252,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
mtmd_cli_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.path.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
+319
-74
@@ -12,19 +12,43 @@
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
|
||||
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
|
||||
enum mtmd_slice_tmpl {
|
||||
MTMD_SLICE_TMPL_NONE,
|
||||
MTMD_SLICE_TMPL_MINICPMV_2_5,
|
||||
MTMD_SLICE_TMPL_MINICPMV_2_6,
|
||||
// TODO @ngxson : add support for idefics (SmolVLM)
|
||||
};
|
||||
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_clip;
|
||||
const struct llama_model * text_model;
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
std::string image_marker;
|
||||
|
||||
// for minicpmv, we need special tokens in-between slices
|
||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
|
||||
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
|
||||
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
|
||||
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
|
||||
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
|
||||
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
|
||||
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
mtmd_context(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
|
||||
const mtmd_context_params & ctx_params) :
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
image_marker (ctx_params.image_marker)
|
||||
{
|
||||
clip_context_params ctx_clip_params;
|
||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
||||
@@ -33,11 +57,66 @@ struct mtmd_context {
|
||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
||||
}
|
||||
this->text_model = text_model;
|
||||
|
||||
GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
|
||||
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||
tok_ov_img_start = lookup_token("<image>");
|
||||
tok_ov_img_end = lookup_token("</image>");
|
||||
tok_slices_start = lookup_token("<slice>");
|
||||
tok_slices_end = lookup_token("</slice>");
|
||||
tok_sli_img_start = tok_ov_img_start;
|
||||
tok_sli_img_end = tok_ov_img_end;
|
||||
tok_row_end = lookup_token("\n");
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
tok_ov_img_start = lookup_token("<image>");
|
||||
tok_ov_img_end = lookup_token("</image>");
|
||||
tok_sli_img_start = lookup_token("<slice>");
|
||||
tok_sli_img_end = lookup_token("</slice>");
|
||||
tok_row_end = lookup_token("\n");
|
||||
|
||||
} else if (minicpmv_version != 0) {
|
||||
GGML_ASSERT(false && "unsupported minicpmv version");
|
||||
}
|
||||
}
|
||||
|
||||
~mtmd_context() {
|
||||
clip_free(ctx_clip);
|
||||
}
|
||||
|
||||
private:
|
||||
llama_token lookup_token(const std::string & token_text) {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(text_model);
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
if (token_to_piece(vocab, i, true) == token_text) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return LLAMA_TOKEN_NULL;
|
||||
}
|
||||
|
||||
std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
||||
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
} else {
|
||||
piece.resize(n_chars);
|
||||
}
|
||||
return piece;
|
||||
}
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens_data {
|
||||
@@ -49,6 +128,7 @@ struct mtmd_image_tokens {
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
uint32_t n_tokens() const { return nx * ny; }
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
};
|
||||
|
||||
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
||||
@@ -88,29 +168,66 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
return result;
|
||||
}
|
||||
|
||||
mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps) {
|
||||
mtmd_input_chunks * output = new mtmd_input_chunks;
|
||||
int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
std::vector<mtmd_input_chunk> & output,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps) {
|
||||
auto vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
std::string prompt_modified(text.text);
|
||||
std::string marker_modified(ctx->image_marker);
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
// a bit hacky here, but works for now
|
||||
// for some models, we need to add prefix and suffix to the image embeddings
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
if (clip_is_gemma3(ctx->ctx_clip)) {
|
||||
// gemma 3
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
}
|
||||
|
||||
std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
|
||||
output->clear();
|
||||
output->reserve(parts.size());
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
// for glm-edge, we don't need to add because the tokens are already in the returned embeddings
|
||||
|
||||
// TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens
|
||||
|
||||
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
|
||||
output.clear();
|
||||
output.reserve(parts.size());
|
||||
|
||||
size_t i_img = 0;
|
||||
|
||||
// utility for adding raw tokens
|
||||
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
{},
|
||||
};
|
||||
output.emplace_back(std::move(chunk));
|
||||
};
|
||||
|
||||
// utility for splitting batch of multiple images into chunks of batch having single images
|
||||
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
|
||||
std::vector<mtmd_input_chunk> chunks;
|
||||
|
||||
for (auto & entry : batch_f32.entries) {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_patches(ctx->ctx_clip);
|
||||
image_tokens->ny = 1;
|
||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
||||
image_tokens->id = id;
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
std::move(image_tokens),
|
||||
};
|
||||
chunks.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
return chunks;
|
||||
};
|
||||
|
||||
for (const auto & part : parts) {
|
||||
//printf("tokenizing part: %s\n", part.c_str());
|
||||
bool add_bos = &parts.front() == ∂
|
||||
@@ -123,66 +240,156 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
|
||||
std::move(tokens),
|
||||
{},
|
||||
};
|
||||
output->emplace_back(std::move(chunk));
|
||||
output.emplace_back(std::move(chunk));
|
||||
|
||||
if (&parts.back() != &part) {
|
||||
// add image token to middle of 2 parts
|
||||
|
||||
if (i_img >= bitmaps.size()) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return nullptr;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// shim layer
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmaps[i_img].nx;
|
||||
img_u8->ny = bitmaps[i_img].ny;
|
||||
img_u8->buf.resize(bitmaps[i_img].data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return nullptr;
|
||||
return 2;
|
||||
}
|
||||
|
||||
mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
|
||||
image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
|
||||
image_tokens->ny = 1; // TODO
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
|
||||
// split batch into chunks of single images
|
||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
image_tokens,
|
||||
};
|
||||
output->emplace_back(std::move(chunk));
|
||||
i_img++;
|
||||
// add overview image
|
||||
add_text_chunk({ctx->tok_ov_img_start});
|
||||
output.emplace_back(std::move(chunks.front()));
|
||||
chunks.erase(chunks.begin());
|
||||
add_text_chunk({ctx->tok_ov_img_end});
|
||||
|
||||
// add slices
|
||||
if (!chunks.empty()) {
|
||||
clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
|
||||
int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
|
||||
int n_row = (int)chunks.size() / n_col;
|
||||
GGML_ASSERT(n_row * n_col == (int)chunks.size());
|
||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_start});
|
||||
}
|
||||
for (int y = 0; y < n_row; y++) {
|
||||
for (int x = 0; x < n_col; x++) {
|
||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_start});
|
||||
}
|
||||
output.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
|
||||
add_text_chunk({ctx->tok_row_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_end});
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image
|
||||
image_tokens->ny = 1; // TODO
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmaps[i_img].id; // optional
|
||||
|
||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
if (clip_is_glm(ctx->ctx_clip)) {
|
||||
// glm-edge
|
||||
image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
|
||||
}
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
std::move(image_tokens),
|
||||
};
|
||||
output.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
i_img++; // move to next image
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
|
||||
for (auto & chunk : *chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
|
||||
delete chunk.tokens_image;
|
||||
}
|
||||
void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
|
||||
if (image_tokens) {
|
||||
delete image_tokens;
|
||||
}
|
||||
delete chunks;
|
||||
}
|
||||
|
||||
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->n_tokens();
|
||||
}
|
||||
|
||||
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->nx;
|
||||
}
|
||||
|
||||
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->ny;
|
||||
}
|
||||
|
||||
std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->id;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
bool ok = false;
|
||||
|
||||
// only effective for minicpmv and qwen2vl, other models will ignore load_image_size
|
||||
{
|
||||
clip_image_size slice_size{
|
||||
image_tokens->batch_f32.entries[0]->nx,
|
||||
image_tokens->batch_f32.entries[0]->ny};
|
||||
clip_add_load_image_size(ctx->ctx_clip, &slice_size);
|
||||
}
|
||||
|
||||
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
int n_tokens_per_image = clip_n_patches(ctx->ctx_clip);
|
||||
ok = clip_image_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
entries[i].get(),
|
||||
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
||||
}
|
||||
} else {
|
||||
ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
}
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -190,9 +397,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
return ctx->image_embd_v.data();
|
||||
}
|
||||
|
||||
size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
|
||||
size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (auto & chunk : *chunks) {
|
||||
for (auto & chunk : chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
n_tokens += chunk.tokens_text.size();
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
@@ -241,35 +448,38 @@ struct decode_embd_batch {
|
||||
|
||||
int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_context * lctx,
|
||||
mtmd_input_chunks * chunks,
|
||||
mtmd_input_chunks & chunks,
|
||||
llama_pos pos0,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch) {
|
||||
int32_t ret;
|
||||
llama_pos n_past = pos0;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
|
||||
for (auto & chunk : *chunks) {
|
||||
bool is_last = &chunk == &chunks->back();
|
||||
for (auto & chunk : chunks) {
|
||||
bool is_last = &chunk == &chunks.back();
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
// TODO @ngxson : may need to split into smaller batches
|
||||
text_batch.n_tokens = chunk.tokens_text.size();
|
||||
for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
|
||||
text_batch.token [i] = chunk.tokens_text[i];
|
||||
text_batch.pos [i] = n_past++;
|
||||
text_batch.n_seq_id[i] = 1;
|
||||
text_batch.seq_id [i][0] = seq_id;
|
||||
text_batch.logits [i] = false;
|
||||
}
|
||||
if (is_last) {
|
||||
// always get logits for last input chunk
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
size_t i = 0;
|
||||
while (i < chunk.tokens_text.size()) { // split into batches
|
||||
for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
|
||||
text_batch.token [i] = chunk.tokens_text[i];
|
||||
text_batch.pos [i] = n_past++;
|
||||
text_batch.n_seq_id[i] = 1;
|
||||
text_batch.seq_id [i][0] = seq_id;
|
||||
text_batch.logits [i] = false;
|
||||
}
|
||||
if (is_last) {
|
||||
// always get logits for last input chunk
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
@@ -277,33 +487,56 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
GGML_ASSERT(chunk.tokens_image != nullptr);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("encoding image...\n");
|
||||
LOG_INF("encoding image or slice...\n");
|
||||
}
|
||||
ret = mtmd_encode(ctx, chunk.tokens_image);
|
||||
ret = mtmd_encode(ctx, chunk.tokens_image.get());
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
}
|
||||
|
||||
int32_t n_tokens = chunk.tokens_image->n_tokens();
|
||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
ret = llama_decode(lctx, batch_img.batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
||||
}
|
||||
|
||||
n_past += n_tokens;
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int32_t pos_offset = i_batch*n_batch;
|
||||
int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
float * embd_batch = embd + pos_offset*n_mmproj_embd;
|
||||
decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
|
||||
|
||||
printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
ret = llama_decode(lctx, batch_img.batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
}
|
||||
|
||||
i_batch++;
|
||||
n_past += n_tokens_batch;
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
@@ -339,3 +572,15 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp
|
||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
||||
mtmd_image_tokens_free(val);
|
||||
}
|
||||
|
||||
+26
-11
@@ -39,12 +39,18 @@ struct mtmd_bitmap {
|
||||
uint32_t nx;
|
||||
uint32_t ny;
|
||||
std::vector<unsigned char> data;
|
||||
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens_deleter {
|
||||
void operator()(mtmd_image_tokens * val); // forward declaration
|
||||
};
|
||||
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
|
||||
|
||||
struct mtmd_input_chunk {
|
||||
mtmd_input_chunk_type type;
|
||||
std::vector<llama_token> tokens_text;
|
||||
mtmd_image_tokens * tokens_image = nullptr;
|
||||
mtmd_image_tokens_ptr tokens_image;
|
||||
};
|
||||
|
||||
using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
|
||||
@@ -82,12 +88,21 @@ MTMD_API void mtmd_free(mtmd_context * ctx);
|
||||
// 3. "<end_of_image>\ndescribe it in detail."
|
||||
// number of bitmaps must be equal to the number of image markers in the prompt
|
||||
// this function is thread-safe (shared ctx)
|
||||
MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
|
||||
// return values:
|
||||
// 0 on success
|
||||
// 1 on number of images not matching the number of markers
|
||||
// 2 on image preprocessing error
|
||||
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
std::vector<mtmd_input_chunk> & output,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps);
|
||||
|
||||
// free image chunk data
|
||||
MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
|
||||
// access mtmd_image_tokens
|
||||
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
|
||||
|
||||
// returns 0 on success
|
||||
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
||||
@@ -96,12 +111,17 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
||||
// get output embeddings from the last encode pass
|
||||
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
|
||||
// whether we need to set non-causal mask before llama_decode
|
||||
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
|
||||
|
||||
|
||||
|
||||
//
|
||||
// helper functions (can be implemented based on other functions)
|
||||
//
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
@@ -110,7 +130,7 @@ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
|
||||
// otherwise, returns 0 on success
|
||||
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_context * lctx,
|
||||
mtmd_input_chunks * chunks,
|
||||
mtmd_input_chunks & chunks,
|
||||
llama_pos pos0,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch);
|
||||
@@ -132,11 +152,6 @@ struct mtmd_context_deleter {
|
||||
};
|
||||
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
||||
|
||||
struct mtmd_input_chunks_deleter {
|
||||
void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
|
||||
};
|
||||
using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
|
||||
|
||||
#else
|
||||
|
||||
static_assert(false && "C header is not yet supported by this library");
|
||||
|
||||
+23
-11
@@ -17,26 +17,30 @@ cd $PROJ_ROOT
|
||||
|
||||
arr_bin=()
|
||||
arr_hf=()
|
||||
arr_tmpl=() # chat template
|
||||
|
||||
add_test() {
|
||||
local bin=$1
|
||||
local hf=$2
|
||||
local tmpl=${3:-""} # default to empty string if not provided
|
||||
arr_bin+=("$bin")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("$tmpl")
|
||||
}
|
||||
|
||||
add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K"
|
||||
add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"
|
||||
add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
|
||||
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" # this model has broken chat template, not usable
|
||||
|
||||
###############
|
||||
|
||||
cmake --build build -j --target "${arr_bin[@]}"
|
||||
@@ -46,12 +50,20 @@ arr_res=()
|
||||
for i in "${!arr_bin[@]}"; do
|
||||
bin="${arr_bin[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
tmpl="${arr_tmpl[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" --temp 0 2>&1 | tee /dev/tty)
|
||||
output=$(\
|
||||
"$PROJ_ROOT/build/bin/$bin" \
|
||||
-hf "$hf" \
|
||||
--image $SCRIPT_DIR/test-1.jpeg \
|
||||
-p "what is the publisher name of the newspaper?" \
|
||||
--temp 0 -n 128 \
|
||||
${tmpl:+--chat-template "$tmpl"} \
|
||||
2>&1 | tee /dev/tty)
|
||||
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
|
||||
+16
-5
@@ -865,9 +865,22 @@ int main(int argc, char ** argv) {
|
||||
console::set_display(console::reset);
|
||||
display = true;
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
if (buffer.empty()) { // Ctrl+D on empty line exits
|
||||
LOG("EOF by user\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (buffer.back() == '\n') {
|
||||
// Implement #587:
|
||||
// If the user wants the text to end in a newline,
|
||||
// this should be accomplished by explicitly adding a newline by using \ followed by return,
|
||||
// then returning control by pressing return again.
|
||||
buffer.pop_back();
|
||||
}
|
||||
|
||||
if (buffer.empty()) { // Enter key on empty line lets the user pass control back
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
} else { // Add tokens to embd only if the input buffer is non-empty
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty() && !params.conversation_mode) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
@@ -915,8 +928,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
|
||||
@@ -297,7 +297,10 @@ int main(int argc, char * argv[]) {
|
||||
}
|
||||
cache_dir = cache_dir_str.c_str();
|
||||
}
|
||||
printf("Starting RPC server\n");
|
||||
printf("Starting RPC server v%d.%d.%d\n",
|
||||
RPC_PROTO_MAJOR_VERSION,
|
||||
RPC_PROTO_MINOR_VERSION,
|
||||
RPC_PROTO_PATCH_VERSION);
|
||||
printf(" endpoint : %s\n", endpoint.c_str());
|
||||
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
|
||||
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
|
||||
|
||||
+132
-113
@@ -1552,29 +1552,30 @@ struct server_queue {
|
||||
std::condition_variable condition_tasks;
|
||||
|
||||
// callback functions
|
||||
std::function<void(server_task)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
std::function<void(server_task &&)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
|
||||
// Add a new task to the end of the queue
|
||||
int post(server_task task, bool front = false) {
|
||||
int post(server_task && task, bool front = false) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
GGML_ASSERT(task.id != -1);
|
||||
// if this is cancel task make sure to clean up pending tasks
|
||||
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
||||
cleanup_pending_task(task.id_target);
|
||||
}
|
||||
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
|
||||
const int task_id = task.id;
|
||||
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
|
||||
if (front) {
|
||||
queue_tasks.push_front(std::move(task));
|
||||
} else {
|
||||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
return task_id;
|
||||
}
|
||||
|
||||
// multi-task version of post()
|
||||
int post(std::vector<server_task> & tasks, bool front = false) {
|
||||
int post(std::vector<server_task> && tasks, bool front = false) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
for (auto & task : tasks) {
|
||||
if (task.id == -1) {
|
||||
@@ -1596,7 +1597,7 @@ struct server_queue {
|
||||
}
|
||||
|
||||
// Add a new task, but defer until one slot is available
|
||||
void defer(server_task task) {
|
||||
void defer(server_task && task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
QUE_DBG("defer task, id = %d\n", task.id);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
@@ -1611,7 +1612,7 @@ struct server_queue {
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(server_task)> callback) {
|
||||
void on_new_task(std::function<void(server_task &&)> callback) {
|
||||
callback_new_task = std::move(callback);
|
||||
}
|
||||
|
||||
@@ -1660,7 +1661,7 @@ struct server_queue {
|
||||
lock.unlock();
|
||||
break;
|
||||
}
|
||||
server_task task = queue_tasks.front();
|
||||
server_task task = std::move(queue_tasks.front());
|
||||
queue_tasks.pop_front();
|
||||
lock.unlock();
|
||||
|
||||
@@ -2004,7 +2005,7 @@ struct server_context {
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(slot);
|
||||
slots.push_back(std::move(slot));
|
||||
}
|
||||
|
||||
default_generation_settings_for_props = slots[0].to_json();
|
||||
@@ -2105,7 +2106,7 @@ struct server_context {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||
bool launch_slot_with_task(server_slot & slot, server_task && task) {
|
||||
slot.reset();
|
||||
slot.id_task = task.id;
|
||||
slot.index = task.index;
|
||||
@@ -2113,10 +2114,10 @@ struct server_context {
|
||||
slot.params = std::move(task.params);
|
||||
slot.prompt_tokens = std::move(task.prompt_tokens);
|
||||
|
||||
if (!are_lora_equal(task.params.lora, slot.lora)) {
|
||||
if (!are_lora_equal(slot.params.lora, slot.lora)) {
|
||||
// if lora is changed, we cannot reuse cached tokens
|
||||
slot.cache_tokens.clear();
|
||||
slot.lora = task.params.lora;
|
||||
slot.lora = slot.params.lora;
|
||||
}
|
||||
|
||||
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
|
||||
@@ -2547,10 +2548,10 @@ struct server_context {
|
||||
server_task task(SERVER_TASK_TYPE_CANCEL);
|
||||
task.id_target = id_task;
|
||||
queue_results.remove_waiting_task_id(id_task);
|
||||
cancel_tasks.push_back(task);
|
||||
cancel_tasks.push_back(std::move(task));
|
||||
}
|
||||
// push to beginning of the queue, so it has highest priority
|
||||
queue_tasks.post(cancel_tasks, true);
|
||||
queue_tasks.post(std::move(cancel_tasks), true);
|
||||
}
|
||||
|
||||
// receive the results from task(s)
|
||||
@@ -2637,7 +2638,7 @@ struct server_context {
|
||||
// Functions to process the task
|
||||
//
|
||||
|
||||
void process_single_task(server_task task) {
|
||||
void process_single_task(server_task && task) {
|
||||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
case SERVER_TASK_TYPE_INFILL:
|
||||
@@ -2651,17 +2652,17 @@ struct server_context {
|
||||
if (slot == nullptr) {
|
||||
// if no slot is available, we defer this task for processing later
|
||||
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (!launch_slot_with_task(*slot, task)) {
|
||||
if (!launch_slot_with_task(*slot, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
||||
break;
|
||||
}
|
||||
@@ -2740,7 +2741,7 @@ struct server_context {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2776,7 +2777,7 @@ struct server_context {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2819,7 +2820,7 @@ struct server_context {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2871,7 +2872,7 @@ struct server_context {
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
|
||||
task.id = queue_tasks.get_new_id();
|
||||
queue_tasks.post(task);
|
||||
queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
// apply context-shift if needed
|
||||
@@ -3633,14 +3634,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// request slots data using task queue
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task, true); // high-priority task
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = task_id;
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
||||
}
|
||||
|
||||
// get the result
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3669,16 +3673,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// request slots data using task queue
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.metrics_reset_bucket = true;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task, true); // high-priority task
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = task_id;
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
||||
}
|
||||
|
||||
// get the result
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3775,17 +3780,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
std::string filepath = params.slot_save_path + filename;
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
||||
task.id = task_id;
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3804,17 +3812,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
std::string filepath = params.slot_save_path + filename;
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
||||
task.id = task_id;
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3826,15 +3837,18 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
||||
task.id = task_id;
|
||||
task.slot_action.slot_id = id_slot;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3938,9 +3952,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
auto completion_id = gen_chatcmplid();
|
||||
std::vector<server_task> tasks;
|
||||
|
||||
std::unordered_set<int> task_ids;
|
||||
try {
|
||||
std::vector<server_task> tasks;
|
||||
|
||||
const auto & prompt = data.at("prompt");
|
||||
// TODO: this log can become very long, put it behind a flag or think about a more compact format
|
||||
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
|
||||
@@ -3955,9 +3970,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
task.prompt_tokens = std::move(tokenized_prompts[i]);
|
||||
task.params = server_task::params_from_json_cmpl(
|
||||
ctx_server.ctx,
|
||||
ctx_server.params_base,
|
||||
data);
|
||||
ctx_server.ctx,
|
||||
ctx_server.params_base,
|
||||
data);
|
||||
task.id_selected_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
// OAI-compat
|
||||
@@ -3965,18 +3980,18 @@ int main(int argc, char ** argv) {
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
|
||||
tasks.push_back(task);
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
task_ids = server_task::get_list_id(tasks);
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(std::move(tasks));
|
||||
} catch (const std::exception & e) {
|
||||
res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(tasks);
|
||||
|
||||
bool stream = json_value(data, "stream", false);
|
||||
const auto task_ids = server_task::get_list_id(tasks);
|
||||
|
||||
if (!stream) {
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
@@ -4268,6 +4283,7 @@ int main(int argc, char ** argv) {
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
bool error = false;
|
||||
std::unordered_set<int> task_ids;
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
|
||||
@@ -4280,28 +4296,27 @@ int main(int argc, char ** argv) {
|
||||
// OAI-compat
|
||||
task.params.oaicompat = oaicompat;
|
||||
|
||||
tasks.push_back(task);
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
task_ids = server_task::get_list_id(tasks);
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(tasks);
|
||||
|
||||
// get the result
|
||||
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
||||
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
|
||||
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
||||
ctx_server.queue_tasks.post(std::move(tasks));
|
||||
}
|
||||
|
||||
// get the result
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
|
||||
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
||||
|
||||
if (error) {
|
||||
return;
|
||||
}
|
||||
@@ -4367,6 +4382,7 @@ int main(int argc, char ** argv) {
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
bool error = false;
|
||||
std::unordered_set<int> task_ids;
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
|
||||
@@ -4376,26 +4392,24 @@ int main(int argc, char ** argv) {
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.index = i;
|
||||
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
|
||||
tasks.push_back(task);
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
task_ids = server_task::get_list_id(tasks);
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(tasks);
|
||||
|
||||
// get the result
|
||||
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
||||
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
ctx_server.queue_tasks.post(std::move(tasks));
|
||||
}
|
||||
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
|
||||
if (error) {
|
||||
return;
|
||||
}
|
||||
@@ -4431,14 +4445,19 @@ int main(int argc, char ** argv) {
|
||||
res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
||||
task.id = task_id;
|
||||
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
// get the result
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -4582,8 +4601,8 @@ int main(int argc, char ** argv) {
|
||||
common_chat_templates_source(ctx_server.chat_templates.get()),
|
||||
common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
|
||||
|
||||
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
|
||||
ctx_server.process_single_task(task);
|
||||
ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
|
||||
ctx_server.process_single_task(std::move(task));
|
||||
});
|
||||
|
||||
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
|
||||
|
||||
@@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
||||
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 1
|
||||
#define RPC_PROTO_MINOR_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
|
||||
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(sandybridge AVX)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
elseif (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
||||
else ()
|
||||
elseif (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS /arch:SSE4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
else ()
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
if (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
if (GGML_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_F16C)
|
||||
|
||||
@@ -263,7 +263,7 @@ void test_x86_is() {
|
||||
static int ggml_backend_cpu_x86_score() {
|
||||
// FIXME: this does not check for OS support
|
||||
|
||||
int score = 0;
|
||||
int score = 1;
|
||||
cpuid_x86 is;
|
||||
|
||||
#ifdef GGML_FMA
|
||||
|
||||
@@ -3237,6 +3237,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -354,6 +354,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,
|
||||
@@ -362,6 +363,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
|
||||
@@ -370,6 +372,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,
|
||||
@@ -378,6 +381,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,
|
||||
@@ -386,6 +390,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,
|
||||
@@ -394,6 +399,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,
|
||||
@@ -402,6 +408,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
|
||||
@@ -437,6 +444,13 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_SET_I32,
|
||||
GGML_METAL_KERNEL_TYPE_SET_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||
@@ -467,6 +481,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_SQRT,
|
||||
GGML_METAL_KERNEL_TYPE_SIN,
|
||||
GGML_METAL_KERNEL_TYPE_COS,
|
||||
GGML_METAL_KERNEL_TYPE_NEG,
|
||||
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
|
||||
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
|
||||
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
|
||||
@@ -1018,6 +1033,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H192, flash_attn_ext_f16_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK192_HV128, flash_attn_ext_f16_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK576_HV512, flash_attn_ext_f16_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && use_bfloat);
|
||||
@@ -1026,6 +1042,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H192, flash_attn_ext_bf16_h192, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK192_HV128, flash_attn_ext_bf16_hk192_hv128, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK576_HV512, flash_attn_ext_bf16_hk576_hv512, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm);
|
||||
@@ -1034,6 +1051,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H192, flash_attn_ext_q4_0_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK192_HV128, flash_attn_ext_q4_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, flash_attn_ext_q4_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK576_HV512, flash_attn_ext_q4_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, flash_attn_ext_q4_1_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, flash_attn_ext_q4_1_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, flash_attn_ext_q4_1_h96, has_simdgroup_mm);
|
||||
@@ -1042,6 +1060,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H192, flash_attn_ext_q4_1_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK192_HV128, flash_attn_ext_q4_1_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, flash_attn_ext_q4_1_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK576_HV512, flash_attn_ext_q4_1_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, flash_attn_ext_q5_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, flash_attn_ext_q5_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, flash_attn_ext_q5_0_h96, has_simdgroup_mm);
|
||||
@@ -1050,6 +1069,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H192, flash_attn_ext_q5_0_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK192_HV128, flash_attn_ext_q5_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, flash_attn_ext_q5_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK576_HV512, flash_attn_ext_q5_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, flash_attn_ext_q5_1_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, flash_attn_ext_q5_1_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, flash_attn_ext_q5_1_h96, has_simdgroup_mm);
|
||||
@@ -1058,6 +1078,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H192, flash_attn_ext_q5_1_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK192_HV128, flash_attn_ext_q5_1_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, flash_attn_ext_q5_1_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK576_HV512, flash_attn_ext_q5_1_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, flash_attn_ext_q8_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, flash_attn_ext_q8_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, flash_attn_ext_q8_0_h96, has_simdgroup_mm);
|
||||
@@ -1066,6 +1087,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H192, flash_attn_ext_q8_0_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction);
|
||||
@@ -1101,6 +1123,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_HK576_HV512, flash_attn_ext_vec_f16_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_HK576_HV512, flash_attn_ext_vec_bf16_hk576_hv512, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_HK576_HV512, flash_attn_ext_vec_q4_0_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_HK576_HV512, flash_attn_ext_vec_q4_1_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_HK576_HV512, flash_attn_ext_vec_q5_0_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_HK576_HV512, flash_attn_ext_vec_q5_1_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_HK576_HV512, flash_attn_ext_vec_q8_0_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
@@ -1131,6 +1160,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
|
||||
@@ -1292,6 +1322,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -1365,6 +1396,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
// TODO: not sure if it is worth adding kernels for this size
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek sizes
|
||||
// TODO: disabled for now, until optmized
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
return false;
|
||||
}
|
||||
@@ -1977,6 +2013,18 @@ static void ggml_metal_encode_node(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_NEG:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
|
||||
@@ -3857,12 +3905,14 @@ static void ggml_metal_encode_node(
|
||||
// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
|
||||
// for now avoiding mainly to keep the number of templates/kernels a bit lower
|
||||
// these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
|
||||
if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192)) {
|
||||
if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
|
||||
@@ -3885,6 +3935,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break;
|
||||
@@ -3907,6 +3959,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64 ].pipeline; break;
|
||||
@@ -3929,6 +3983,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64 ].pipeline; break;
|
||||
@@ -3951,6 +4007,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64 ].pipeline; break;
|
||||
@@ -3973,6 +4031,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64 ].pipeline; break;
|
||||
@@ -3995,6 +4055,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64 ].pipeline; break;
|
||||
@@ -4114,12 +4176,36 @@ static void ggml_metal_encode_node(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case 576:
|
||||
{
|
||||
if (ne20 == 512) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_HK576_HV512].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
|
||||
GGML_LOG_ERROR("add template specialization for this type\n");
|
||||
GGML_ABORT("add template specialization for this type");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne20);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -949,6 +949,13 @@ kernel void kernel_cos(
|
||||
dst[tpig] = cos(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_neg(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = -src0[tpig];
|
||||
}
|
||||
|
||||
kernel void kernel_sum_rows(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
@@ -3546,6 +3553,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
|
||||
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64, 64>;
|
||||
@@ -3556,6 +3564,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
|
||||
#endif
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;
|
||||
@@ -3566,6 +3575,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80, 80>;
|
||||
@@ -3575,6 +3585,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80, 80>;
|
||||
@@ -3584,6 +3595,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80, 80>;
|
||||
@@ -3593,6 +3605,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80, 80>;
|
||||
@@ -3602,6 +3615,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
@@ -4009,6 +4023,16 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 4>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 576, 512, 2>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 576, 512, 2>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
template<typename T>
|
||||
|
||||
@@ -92,12 +92,19 @@ enum rpc_cmd {
|
||||
RPC_CMD_GET_DEVICE_MEMORY,
|
||||
RPC_CMD_INIT_TENSOR,
|
||||
RPC_CMD_GET_ALLOC_SIZE,
|
||||
RPC_CMD_HELLO,
|
||||
RPC_CMD_COUNT,
|
||||
};
|
||||
|
||||
// Try RPC_CMD_SET_TENSOR_HASH first when data size is larger than this threshold
|
||||
const size_t HASH_THRESHOLD = 10 * 1024 * 1024;
|
||||
|
||||
struct rpc_msg_hello_rsp {
|
||||
uint8_t major;
|
||||
uint8_t minor;
|
||||
uint8_t patch;
|
||||
};
|
||||
|
||||
struct rpc_msg_get_alloc_size_req {
|
||||
rpc_tensor tensor;
|
||||
};
|
||||
@@ -400,6 +407,20 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
||||
|
||||
// RPC client-side implementation
|
||||
|
||||
static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_hello_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
||||
fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
return false;
|
||||
}
|
||||
if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
|
||||
fprintf(stderr, "WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
@@ -433,6 +454,9 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
if (sock == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!check_server_version(sock)) {
|
||||
return nullptr;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
sockets[endpoint] = sock;
|
||||
return sock;
|
||||
@@ -818,6 +842,7 @@ public:
|
||||
}
|
||||
~rpc_server();
|
||||
|
||||
void hello(rpc_msg_hello_rsp & response);
|
||||
void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
|
||||
void get_alignment(rpc_msg_get_alignment_rsp & response);
|
||||
void get_max_size(rpc_msg_get_max_size_rsp & response);
|
||||
@@ -846,6 +871,13 @@ private:
|
||||
std::unordered_set<ggml_backend_buffer_t> buffers;
|
||||
};
|
||||
|
||||
void rpc_server::hello(rpc_msg_hello_rsp & response) {
|
||||
response.major = RPC_PROTO_MAJOR_VERSION;
|
||||
response.minor = RPC_PROTO_MINOR_VERSION;
|
||||
response.patch = RPC_PROTO_PATCH_VERSION;
|
||||
GGML_PRINT_DEBUG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
|
||||
}
|
||||
|
||||
bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
|
||||
ggml_backend_buffer_type_t buft;
|
||||
struct ggml_init_params params {
|
||||
@@ -1271,8 +1303,24 @@ rpc_server::~rpc_server() {
|
||||
static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
||||
rpc_server server(backend, cache_dir);
|
||||
uint8_t cmd;
|
||||
if (!recv_data(sockfd, &cmd, 1)) {
|
||||
return;
|
||||
}
|
||||
// the first command sent by the client must be HELLO
|
||||
if (cmd != RPC_CMD_HELLO) {
|
||||
fprintf(stderr, "Expected HELLO command, update client\n");
|
||||
return;
|
||||
}
|
||||
if (!recv_msg(sockfd, nullptr, 0)) {
|
||||
return;
|
||||
}
|
||||
rpc_msg_hello_rsp response;
|
||||
server.hello(response);
|
||||
if (!send_msg(sockfd, &response, sizeof(response))) {
|
||||
return;
|
||||
}
|
||||
while (true) {
|
||||
uint8_t cmd;
|
||||
if (!recv_data(sockfd, &cmd, 1)) {
|
||||
break;
|
||||
}
|
||||
@@ -1282,6 +1330,10 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
break;
|
||||
}
|
||||
switch (cmd) {
|
||||
case RPC_CMD_HELLO: {
|
||||
// HELLO command is handled above
|
||||
return;
|
||||
}
|
||||
case RPC_CMD_ALLOC_BUFFER: {
|
||||
rpc_msg_alloc_buffer_req request;
|
||||
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#ifndef GGML_SYCL_BACKEND_HPP
|
||||
#define GGML_SYCL_BACKEND_HPP
|
||||
|
||||
#include "binbcast.hpp"
|
||||
#include "concat.hpp"
|
||||
#include "common.hpp"
|
||||
#include "conv.hpp"
|
||||
|
||||
@@ -0,0 +1,350 @@
|
||||
#include "binbcast.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) /
|
||||
ne3;
|
||||
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) %
|
||||
ne3;
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
for (int i0 = i0s; i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
}
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
const int i3 = i/(ne2*ne1*ne0);
|
||||
const int i2 = (i/(ne1*ne0)) % ne2;
|
||||
const int i1 = (i/ne0) % ne1;
|
||||
const int i0 = i % ne0;
|
||||
|
||||
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
|
||||
|
||||
template<float (*bin_op)(const float, const float)>
|
||||
struct bin_bcast_sycl {
|
||||
template <typename src0_t, typename src1_t, typename dst_t>
|
||||
void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
|
||||
const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
|
||||
const int64_t ne12, const int64_t ne13, const int64_t ne0, const int64_t ne1, const int64_t ne2,
|
||||
const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
|
||||
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
|
||||
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
|
||||
int nr0 = ne10 / ne0;
|
||||
int nr1 = ne11/ne1;
|
||||
int nr2 = ne12/ne2;
|
||||
int nr3 = ne13/ne3;
|
||||
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
// collapse dimensions until first broadcast dimension
|
||||
int64_t cne[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne0[] = {ne00, ne01, ne02, ne03};
|
||||
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
||||
size_t cnb[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb0[] = {nb00, nb01, nb02, nb03};
|
||||
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
cne[1] = cne[2];
|
||||
cne[2] = cne[3];
|
||||
cne[3] = 1;
|
||||
};
|
||||
|
||||
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
||||
cnb[1] *= cne[1];
|
||||
cnb[2] *= cne[2];
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb, cne);
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
int64_t ne0 = cne[0];
|
||||
int64_t ne1 = cne[1];
|
||||
int64_t ne2 = cne[2];
|
||||
int64_t ne3 = cne[3];
|
||||
|
||||
int64_t ne10 = cne1[0];
|
||||
int64_t ne11 = cne1[1];
|
||||
int64_t ne12 = cne1[2];
|
||||
int64_t ne13 = cne1[3];
|
||||
|
||||
size_t nb0 = cnb[0];
|
||||
size_t nb1 = cnb[1];
|
||||
size_t nb2 = cnb[2];
|
||||
size_t nb3 = cnb[3];
|
||||
|
||||
size_t nb00 = cnb0[0];
|
||||
size_t nb01 = cnb0[1];
|
||||
size_t nb02 = cnb0[2];
|
||||
size_t nb03 = cnb0[3];
|
||||
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
|
||||
size_t s10 = nb10 / sizeof(src1_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_UNUSED(s00);
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
||||
|
||||
sycl::range<3> block_dims(1, 1, 1);
|
||||
block_dims[2] = std::min<unsigned int>(hne0, block_size);
|
||||
block_dims[1] = std::min<unsigned int>(
|
||||
ne1, block_size / (unsigned int)block_dims[2]);
|
||||
block_dims[0] = std::min(
|
||||
std::min<unsigned int>(
|
||||
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
|
||||
(unsigned int)block_dims[1]),
|
||||
64U);
|
||||
|
||||
sycl::range<3> block_nums(
|
||||
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
|
||||
(ne1 + block_dims[1] - 1) / block_dims[1],
|
||||
(hne0 + block_dims[2] - 1) / block_dims[2]);
|
||||
|
||||
if (block_nums[0] > 65535) {
|
||||
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
||||
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
|
||||
sycl::range<3>(1, 1, block_size),
|
||||
sycl::range<3>(1, 1, block_size)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
||||
s03, s11, s12, s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:16: The work-group size passed to the SYCL kernel may
|
||||
exceed the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if
|
||||
needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <class op>
|
||||
inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
|
||||
ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
|
||||
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
|
||||
ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
|
||||
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
|
||||
ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
|
||||
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||
op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
||||
op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_add(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_sub(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_mul(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_div(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_repeat(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
#ifndef GGML_SYCL_BINBCAST_HPP
|
||||
#define GGML_SYCL_BINBCAST_HPP
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_add(const float a, const float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_sub(const float a, const float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_mul(const float a, const float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_div(const float a, const float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
|
||||
#endif //GGML_SYCL_BINBCAST_HPP
|
||||
|
||||
@@ -494,286 +494,5 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
|
||||
|
||||
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) /
|
||||
ne3;
|
||||
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) %
|
||||
ne3;
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
for (int i0 = i0s; i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
}
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
const int i3 = i/(ne2*ne1*ne0);
|
||||
const int i2 = (i/(ne1*ne0)) % ne2;
|
||||
const int i1 = (i/ne0) % ne1;
|
||||
const int i0 = i % ne0;
|
||||
|
||||
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
|
||||
|
||||
template<float (*bin_op)(const float, const float)>
|
||||
struct bin_bcast_sycl {
|
||||
template <typename src0_t, typename src1_t, typename dst_t>
|
||||
void operator()(ggml_backend_sycl_context & ctx,
|
||||
const struct ggml_tensor *src0,
|
||||
const struct ggml_tensor *src1, struct ggml_tensor *dst,
|
||||
const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
|
||||
queue_ptr stream) {
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
int nr0 = ne10/ne0;
|
||||
int nr1 = ne11/ne1;
|
||||
int nr2 = ne12/ne2;
|
||||
int nr3 = ne13/ne3;
|
||||
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
// collapse dimensions until first broadcast dimension
|
||||
int64_t cne[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne0[] = {ne00, ne01, ne02, ne03};
|
||||
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
||||
size_t cnb[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb0[] = {nb00, nb01, nb02, nb03};
|
||||
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
cne[1] = cne[2];
|
||||
cne[2] = cne[3];
|
||||
cne[3] = 1;
|
||||
};
|
||||
|
||||
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
||||
cnb[1] *= cne[1];
|
||||
cnb[2] *= cne[2];
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb, cne);
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
int64_t ne0 = cne[0];
|
||||
int64_t ne1 = cne[1];
|
||||
int64_t ne2 = cne[2];
|
||||
int64_t ne3 = cne[3];
|
||||
|
||||
int64_t ne10 = cne1[0];
|
||||
int64_t ne11 = cne1[1];
|
||||
int64_t ne12 = cne1[2];
|
||||
int64_t ne13 = cne1[3];
|
||||
|
||||
size_t nb0 = cnb[0];
|
||||
size_t nb1 = cnb[1];
|
||||
size_t nb2 = cnb[2];
|
||||
size_t nb3 = cnb[3];
|
||||
|
||||
size_t nb00 = cnb0[0];
|
||||
size_t nb01 = cnb0[1];
|
||||
size_t nb02 = cnb0[2];
|
||||
size_t nb03 = cnb0[3];
|
||||
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
|
||||
size_t s10 = nb10 / sizeof(src1_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_UNUSED(s00);
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
||||
|
||||
sycl::range<3> block_dims(1, 1, 1);
|
||||
block_dims[2] = std::min<unsigned int>(hne0, block_size);
|
||||
block_dims[1] = std::min<unsigned int>(
|
||||
ne1, block_size / (unsigned int)block_dims[2]);
|
||||
block_dims[0] = std::min(
|
||||
std::min<unsigned int>(
|
||||
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
|
||||
(unsigned int)block_dims[1]),
|
||||
64U);
|
||||
|
||||
sycl::range<3> block_nums(
|
||||
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
|
||||
(ne1 + block_dims[1] - 1) / block_dims[1],
|
||||
(hne0 + block_dims[2] - 1) / block_dims[2]);
|
||||
|
||||
if (block_nums[0] > 65535) {
|
||||
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
||||
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
|
||||
sycl::range<3>(1, 1, block_size),
|
||||
sycl::range<3>(1, 1, block_size)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
||||
s03, s11, s12, s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:16: The work-group size passed to the SYCL kernel may
|
||||
exceed the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if
|
||||
needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
GGML_UNUSED(ctx);
|
||||
}
|
||||
};
|
||||
|
||||
template <class op>
|
||||
inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst) {
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()(ctx, src0, src1, dst, (const float *)src0->data, (const float *)src1->data, (float *)dst->data, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data,
|
||||
(sycl::half *)dst->data, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data, (float *)dst->data,
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||
op()(ctx, src0, src1, dst, (const int32_t *)src0->data, (const int32_t *)src1->data, (int32_t *)dst->data,
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
||||
op()(ctx, src0, src1, dst, (const int16_t *)src0->data, (const int16_t *)src1->data, (int16_t *)dst->data,
|
||||
main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -1261,27 +1261,6 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
ggml_sycl_op_sqrt(ctx, dst);
|
||||
@@ -1409,29 +1388,3 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_add(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_sub(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_mul(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_div(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
@@ -10,27 +10,6 @@ T neg_infinity() {
|
||||
return -std::numeric_limits<T>::infinity();
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_add(const float a, const float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_sub(const float a, const float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_mul(const float a, const float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_div(const float a, const float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
struct typed_data {
|
||||
const T * src;
|
||||
@@ -87,14 +66,5 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
// ---------
|
||||
|
||||
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_ELEMENTWISE_HPP
|
||||
|
||||
|
||||
@@ -1967,11 +1967,6 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_sycl_op_mul_mat_sycl(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
@@ -2600,12 +2595,6 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
|
||||
static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_repeat(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_get_rows(ctx, dst);
|
||||
@@ -3179,11 +3168,6 @@ static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_op_diag_mask_inf(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
|
||||
ggml_sycl_op_rope(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_pool2d(ctx, dst);
|
||||
}
|
||||
@@ -3972,7 +3956,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
@@ -3982,7 +3965,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
return (op->src[0]->type == GGML_TYPE_F32);
|
||||
case GGML_OP_REPEAT:
|
||||
return true;
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
@@ -4013,7 +3997,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (mode == GGML_ROPE_TYPE_MROPE) {
|
||||
return false;
|
||||
}
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
|
||||
+94
-103
@@ -34,23 +34,21 @@ static void rope_yarn(
|
||||
*sin_theta = sycl::sin(theta) * mscale;
|
||||
}
|
||||
|
||||
template<typename T, bool has_ff>
|
||||
static void rope_norm(
|
||||
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
template <typename T, bool has_ff>
|
||||
static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * pos, float freq_scale, float ext_factor, float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1));
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row*ne0 + i0;
|
||||
const int i = row * ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
@@ -58,42 +56,43 @@ static void rope_norm(
|
||||
return;
|
||||
}
|
||||
|
||||
const int i = row*ne0 + i0;
|
||||
const int i2 = row/p_delta_rows;
|
||||
const int row0 = row % ne1;
|
||||
const int channel0 = row / ne1;
|
||||
|
||||
const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
|
||||
const int i = row * ne0 + i0;
|
||||
const int i2 = channel0 * s2 + row0 * s1 + i0;
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + 1];
|
||||
const float x0 = x[i2 + 0];
|
||||
const float x1 = x[i2 + 1];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
dst[i + 0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst[i + 1] = x0 * sin_theta + x1 * cos_theta;
|
||||
}
|
||||
|
||||
template<typename T, bool has_ff>
|
||||
static void rope_neox(
|
||||
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
template <typename T, bool has_ff>
|
||||
static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1));
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row*ne0 + i0;
|
||||
const int i = row * ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
@@ -101,23 +100,26 @@ static void rope_neox(
|
||||
return;
|
||||
}
|
||||
|
||||
const int i = row*ne0 + i0/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
const int row0 = row % ne1;
|
||||
const int channel0 = row / ne1;
|
||||
|
||||
const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
|
||||
const int i = row * ne0 + i0 / 2;
|
||||
const int i2 = channel0 * s2 + row0 * s1 + i0 / 2;
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + n_dims/2];
|
||||
const float x0 = x[i2 + 0];
|
||||
const float x1 = x[i2 + n_dims / 2];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
dst[i + 0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta;
|
||||
}
|
||||
|
||||
template <typename T, bool has_ff>
|
||||
@@ -163,18 +165,18 @@ static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, cons
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void rope_norm_sycl(
|
||||
const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
|
||||
static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2,
|
||||
const int n_dims, int nr, const int32_t * pos, const float freq_scale, const float freq_base,
|
||||
const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
|
||||
const float * freq_factors, queue_ptr stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||
const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, num_blocks_x, nr);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
/*
|
||||
@@ -182,61 +184,47 @@ static void rope_norm_sycl(
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
|
||||
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
|
||||
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void rope_neox_sycl(
|
||||
const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
|
||||
static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2,
|
||||
const int n_dims, const int nr, const int32_t * pos, const float freq_scale,
|
||||
const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||
const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, num_blocks_x, nr);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
} else {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -272,7 +260,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
@@ -329,43 +317,46 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
if (is_neox) {
|
||||
GGML_SYCL_DEBUG("%s: neox path\n", __func__);
|
||||
if (dst->src[0]->type == GGML_TYPE_F32) {
|
||||
rope_neox_sycl(
|
||||
(const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
rope_neox_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, s01, s02, n_dims, nr,
|
||||
pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
rope_neox_sycl(
|
||||
(const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
rope_neox_sycl((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data, ne00, ne01, s01, s02,
|
||||
n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors,
|
||||
main_stream);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} else if (is_vision) {
|
||||
GGML_SYCL_DEBUG("%s: vision path\n", __func__);
|
||||
if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
rope_vision_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
|
||||
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, main_stream);
|
||||
rope_vision_sycl((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data, ne00, ne01, ne02, s01,
|
||||
s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
|
||||
freq_factors, sections, main_stream);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F32) {
|
||||
rope_vision_sycl((const float *) dst->src[0]->data, (float *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
|
||||
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, main_stream);
|
||||
rope_vision_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims,
|
||||
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections,
|
||||
main_stream);
|
||||
} else {
|
||||
GGML_ABORT("Fatal error: Tensor type unsupported!");
|
||||
}
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("%s: norm path\n", __func__);
|
||||
if (dst->src[0]->type == GGML_TYPE_F32) {
|
||||
rope_norm_sycl(
|
||||
(const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
rope_norm_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, s01, s02, n_dims, nr,
|
||||
pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
rope_norm_sycl(
|
||||
(const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
rope_norm_sycl((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data, ne00, ne01, s01, s02,
|
||||
n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors,
|
||||
main_stream);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_rope(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,6 @@
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
|
||||
void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
|
||||
|
||||
#endif // GGML_SYCL_ROPE_HPP
|
||||
|
||||
@@ -2397,7 +2397,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
@@ -6006,6 +6006,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -6216,7 +6217,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_L2_NORM:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
@@ -6233,6 +6233,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
elements = { nr, 1, 1 };
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 };
|
||||
break;
|
||||
|
||||
case GGML_OP_SUM:
|
||||
// We use GGML_OP_SUM_ROWS with 1 row.
|
||||
elements = { 1, 1, 1 };
|
||||
@@ -6883,7 +6887,17 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
|
||||
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
op_params[0], 0.0f,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
@@ -9261,6 +9275,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case 112:
|
||||
case 128:
|
||||
case 256:
|
||||
case 575: // DeepSeek MLA
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -9387,10 +9402,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return true;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_ADD:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
@@ -8,19 +8,29 @@
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint ncols = p.ne00;
|
||||
const uint nrows = gl_NumWorkGroups.x;
|
||||
const uint nchannels = gl_NumWorkGroups.y;
|
||||
|
||||
const uint row = gl_WorkGroupID.x;
|
||||
const uint channel = gl_WorkGroupID.y;
|
||||
const uint samp = gl_WorkGroupID.z;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
const uint stride_row = p.nb01;
|
||||
const uint stride_channel = p.nb02;
|
||||
const uint stride_sample = p.nb03;
|
||||
|
||||
uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset();
|
||||
uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();
|
||||
|
||||
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]);
|
||||
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_offset + col]);
|
||||
sum[tid] += xi * xi;
|
||||
}
|
||||
|
||||
@@ -33,10 +43,10 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX);
|
||||
const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols);
|
||||
const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col]));
|
||||
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
|
||||
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,11 @@ as an example for its usage.
|
||||
pip install gguf
|
||||
```
|
||||
|
||||
Optionally, you can install gguf with the extra 'gui' to enable the visual GGUF editor.
|
||||
```sh
|
||||
pip install gguf[gui]
|
||||
```
|
||||
|
||||
## API Examples/Simple Tools
|
||||
|
||||
[examples/writer.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
|
||||
@@ -25,6 +30,8 @@ pip install gguf
|
||||
|
||||
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
|
||||
[gguf/scripts/gguf_editor_gui.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_editor_gui.py) — Allows for viewing, editing, adding, or removing metadata values within a GGUF file as well as viewing its tensors with a Qt interface.
|
||||
|
||||
## Development
|
||||
Maintainers who participate in development of this package are advised to install it in editable mode:
|
||||
|
||||
|
||||
+137
-2
@@ -218,17 +218,37 @@ class Keys:
|
||||
TYPE = "adapter.type"
|
||||
LORA_ALPHA = "adapter.lora.alpha"
|
||||
|
||||
class ClipVision:
|
||||
PROJECTOR_TYPE = "clip.projector_type"
|
||||
HAS_VISION_ENCODER = "clip.has_vision_encoder"
|
||||
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
|
||||
IMAGE_SIZE = "clip.vision.image_size"
|
||||
PATCH_SIZE = "clip.vision.patch_size"
|
||||
EMBEDDING_LENGTH = "clip.vision.embedding_length"
|
||||
FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
|
||||
PROJECTION_DIM = "clip.vision.projection_dim"
|
||||
BLOCK_COUNT = "clip.vision.block_count"
|
||||
IMAGE_MEAN = "clip.vision.image_mean"
|
||||
IMAGE_STD = "clip.vision.image_std"
|
||||
USE_GELU = "clip.use_gelu"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.vision.attention.head_count"
|
||||
LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon"
|
||||
|
||||
#
|
||||
# recommended mapping of model tensor names for storage in gguf
|
||||
#
|
||||
|
||||
|
||||
class GGUFType:
|
||||
MODEL = "model"
|
||||
ADAPTER = "adapter"
|
||||
MODEL = "model"
|
||||
ADAPTER = "adapter"
|
||||
CLIP_VISION = "clip-vision"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
CLIP_VISION = auto() # dummy arch for clip.cpp
|
||||
LLAMA = auto()
|
||||
LLAMA4 = auto()
|
||||
DECI = auto()
|
||||
@@ -297,6 +317,16 @@ class MODEL_ARCH(IntEnum):
|
||||
BAILINGMOE = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
MLP = auto()
|
||||
LDP = auto()
|
||||
LDPV2 = auto()
|
||||
RESAMPLER = auto()
|
||||
GLM_EDGE = auto()
|
||||
MERGER = auto()
|
||||
GEMMA3 = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD = auto()
|
||||
TOKEN_EMBD_NORM = auto()
|
||||
@@ -436,9 +466,41 @@ class MODEL_TENSOR(IntEnum):
|
||||
POSNET_ATTN_K = auto()
|
||||
POSNET_ATTN_V = auto()
|
||||
POSNET_ATTN_OUT = auto()
|
||||
# vision
|
||||
V_MMPROJ = auto()
|
||||
V_MMPROJ_FC = auto()
|
||||
V_MMPROJ_MLP = auto()
|
||||
V_MMPROJ_PEG = auto()
|
||||
V_ENC_EMBD_CLS = auto()
|
||||
V_ENC_EMBD_PATCH = auto()
|
||||
V_ENC_EMBD_POS = auto()
|
||||
V_ENC_ATTN_Q = auto()
|
||||
V_ENC_ATTN_K = auto()
|
||||
V_ENC_ATTN_V = auto()
|
||||
V_ENC_INPUT_NORM = auto()
|
||||
V_ENC_OUTPUT = auto()
|
||||
V_ENC_OUTPUT_NORM = auto()
|
||||
V_ENC_FFN_UP = auto()
|
||||
V_ENC_FFN_DOWN = auto()
|
||||
V_PRE_NORM = auto()
|
||||
V_POST_NORM = auto()
|
||||
V_MM_INP_PROJ = auto() # gemma3
|
||||
V_MM_SOFT_EMB_NORM = auto() # gemma3
|
||||
V_RESMPL_POS_EMBD_K = auto() # minicpmv
|
||||
V_RESMPL_ATTN_Q = auto() # minicpmv
|
||||
V_RESMPL_ATTN_K = auto() # minicpmv
|
||||
V_RESMPL_ATTN_V = auto() # minicpmv
|
||||
V_RESMPL_ATTN_OUT = auto() # minicpmv
|
||||
V_RESMPL_KV = auto() # minicpmv
|
||||
V_RESMPL_KV_NORM = auto() # minicpmv
|
||||
V_RESMPL_POST_NORM = auto() # minicpmv
|
||||
V_RESMPL_Q_NORM = auto() # minicpmv
|
||||
V_RESMPL_PROJ = auto() # minicpmv
|
||||
V_RESMPL_QUERY = auto() # minicpmv
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.LLAMA4: "llama4",
|
||||
MODEL_ARCH.DECI: "deci",
|
||||
@@ -507,6 +569,16 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.BAILINGMOE: "bailingmoe",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
VISION_PROJECTOR_TYPE.MLP: "mlp",
|
||||
VISION_PROJECTOR_TYPE.LDP: "ldp",
|
||||
VISION_PROJECTOR_TYPE.LDPV2: "ldpv2",
|
||||
VISION_PROJECTOR_TYPE.RESAMPLER: "resampler",
|
||||
VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
|
||||
VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
|
||||
VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
@@ -646,9 +718,72 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
|
||||
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
|
||||
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
|
||||
# vision
|
||||
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
|
||||
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}",
|
||||
MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
|
||||
MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
|
||||
MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
|
||||
MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2",
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
|
||||
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out",
|
||||
MODEL_TENSOR.V_RESMPL_KV: "resampler.kv",
|
||||
MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv",
|
||||
MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post",
|
||||
MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q",
|
||||
MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj",
|
||||
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_ARCH.CLIP_VISION: [
|
||||
MODEL_TENSOR.V_MMPROJ,
|
||||
MODEL_TENSOR.V_MMPROJ_FC,
|
||||
MODEL_TENSOR.V_MMPROJ_MLP,
|
||||
MODEL_TENSOR.V_MMPROJ_PEG,
|
||||
MODEL_TENSOR.V_ENC_EMBD_CLS,
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH,
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q,
|
||||
MODEL_TENSOR.V_ENC_ATTN_K,
|
||||
MODEL_TENSOR.V_ENC_ATTN_V,
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM,
|
||||
MODEL_TENSOR.V_ENC_OUTPUT,
|
||||
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
|
||||
MODEL_TENSOR.V_ENC_FFN_UP,
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN,
|
||||
MODEL_TENSOR.V_PRE_NORM,
|
||||
MODEL_TENSOR.V_POST_NORM,
|
||||
MODEL_TENSOR.V_MM_INP_PROJ,
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_V,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_OUT,
|
||||
MODEL_TENSOR.V_RESMPL_KV,
|
||||
MODEL_TENSOR.V_RESMPL_KV_NORM,
|
||||
MODEL_TENSOR.V_RESMPL_POST_NORM,
|
||||
MODEL_TENSOR.V_RESMPL_Q_NORM,
|
||||
MODEL_TENSOR.V_RESMPL_PROJ,
|
||||
MODEL_TENSOR.V_RESMPL_QUERY,
|
||||
],
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -4,3 +4,4 @@ from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
|
||||
from .gguf_dump import main as gguf_dump_entrypoint
|
||||
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
|
||||
from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
|
||||
from .gguf_editor_gui import main as gguf_editor_gui_entrypoint
|
||||
|
||||
Executable
+1610
File diff suppressed because it is too large
Load Diff
@@ -886,6 +886,150 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.POSNET_ATTN_OUT: (
|
||||
"backbone.posnet.{bid}.proj_out", # wavtokenizer
|
||||
),
|
||||
|
||||
#############################################################################
|
||||
## Vision encoder
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ: (
|
||||
"multi_modal_projector.linear_{bid}",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
"model.connector.modality_projection.proj", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||
"model.mm_projector.mlp.mlp.{bid}",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_PEG: (
|
||||
"model.mm_projector.peg.peg.{bid}",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_CLS: (
|
||||
"vision_tower.vision_model.embeddings.class_embedding",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
|
||||
"vision_tower.vision_model.embeddings.patch_embedding",
|
||||
"vpm.embeddings.patch_embedding",
|
||||
"model.vision_model.embeddings.patch_embedding", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS: (
|
||||
"vision_tower.vision_model.embeddings.position_embedding",
|
||||
"vpm.embeddings.position_embedding",
|
||||
"model.vision_model.embeddings.position_embedding", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"vpm.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"vpm.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"vpm.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
|
||||
"vpm.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_OUTPUT: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"vpm.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"vpm.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"vpm.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"vpm.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_PRE_NORM: (
|
||||
"vision_tower.vision_model.pre_layrnorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_POST_NORM: (
|
||||
"vision_tower.vision_model.post_layernorm",
|
||||
"model.vision_model.post_layernorm", # SmolVLM
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
"multi_modal_projector.mm_input_projection",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
|
||||
"multi_modal_projector.mm_soft_emb_norm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
|
||||
"resampler.pos_embed_k",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q: (
|
||||
"resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K: (
|
||||
"resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_V: (
|
||||
"resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
|
||||
"resampler.attn.out_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_KV: (
|
||||
"resampler.kv_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_POST_NORM: (
|
||||
"resampler.ln_post",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_KV_NORM: (
|
||||
"resampler.ln_kv",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_Q_NORM: (
|
||||
"resampler.ln_q",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_PROJ: (
|
||||
"resampler.proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_QUERY: (
|
||||
"resampler.query",
|
||||
),
|
||||
}
|
||||
|
||||
# architecture-specific block mappings
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.16.0"
|
||||
version = "0.16.2"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
@@ -23,10 +23,14 @@ numpy = ">=1.17"
|
||||
tqdm = ">=4.27"
|
||||
pyyaml = ">=5.1"
|
||||
sentencepiece = ">=0.1.98,<=0.2.0"
|
||||
PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^5.2"
|
||||
|
||||
[tool.poetry.extras]
|
||||
gui = ["PySide6"]
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
@@ -36,3 +40,4 @@ gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
|
||||
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
|
||||
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
|
||||
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
|
||||
gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint"
|
||||
|
||||
@@ -11,3 +11,5 @@
|
||||
-r ./requirements-convert_legacy_llama.txt
|
||||
-r ./requirements-convert_llama_ggml_to_gguf.txt
|
||||
-r ./requirements-tool_bench.txt
|
||||
|
||||
-r ./requirements-gguf_editor_gui.txt
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
numpy~=1.26.4
|
||||
PySide6~=6.9.0
|
||||
gguf>=0.16.0
|
||||
@@ -121,6 +121,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
||||
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
||||
} else if (tmpl_contains("bos_token + message['role']")) {
|
||||
|
||||
@@ -484,7 +484,7 @@ ggml_tensor * llama_context::build_rope_shift(
|
||||
|
||||
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float yarn_attn_factor_scaled = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
|
||||
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
|
||||
|
||||
ggml_tensor * tmp;
|
||||
|
||||
@@ -504,14 +504,14 @@ ggml_tensor * llama_context::build_rope_shift(
|
||||
|
||||
tmp = ggml_rope_ext_inplace(ctx0, tmp,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
|
||||
tmp = ggml_cpy(ctx0, tmp, cur);
|
||||
} else {
|
||||
// we rotate only the first n_rot dimensions
|
||||
tmp = ggml_rope_ext_inplace(ctx0, cur,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
}
|
||||
|
||||
return tmp;
|
||||
@@ -2278,11 +2278,6 @@ llama_context * llama_init_from_model(
|
||||
params.flash_attn = false;
|
||||
}
|
||||
|
||||
if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) {
|
||||
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__);
|
||||
params.flash_attn = false;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
|
||||
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
|
||||
return nullptr;
|
||||
|
||||
+8
-6
@@ -1200,9 +1200,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
//const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
// note: for MLA with the absorption optimization, the final embedding size will be changed via v_mla
|
||||
const auto n_embd_head_v = v_mla == nullptr ? v_trans ? v->ne[1] : v->ne[0] : v_mla->ne[1];
|
||||
|
||||
const auto n_tokens = q->ne[1];
|
||||
const auto n_head = q->ne[2];
|
||||
const auto n_kv = k->ne[1];
|
||||
@@ -1231,7 +1228,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
|
||||
cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
|
||||
if (v_mla) {
|
||||
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
|
||||
cur = ggml_mul_mat(ctx0, v_mla, cur);
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
} else {
|
||||
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
||||
|
||||
@@ -1274,9 +1276,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
kqv = ggml_mul_mat(ctx0, v_mla, kqv);
|
||||
}
|
||||
|
||||
ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
||||
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
||||
|
||||
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
||||
cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
|
||||
if (!cparams.offload_kqv) {
|
||||
// all nodes between the KV store and the attention output are run on the CPU
|
||||
|
||||
+3
-3
@@ -10050,7 +10050,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
||||
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
|
||||
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
||||
const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
@@ -10127,13 +10127,13 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||
|
||||
q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(q_pe, "q_pe", il);
|
||||
|
||||
k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(k_pe, "k_pe", il);
|
||||
|
||||
|
||||
@@ -4428,10 +4428,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_timestep_embedding());
|
||||
test_cases.emplace_back(new test_leaky_relu());
|
||||
|
||||
for (int hsk : { 64, 80, 128, 192, 256, }) {
|
||||
for (int hsv : { 64, 80, 128, 192, 256, }) {
|
||||
if (hsk != 192 && hsk != hsv) continue;
|
||||
for (int hsk : { 64, 80, 128, 192, 256, 576 }) {
|
||||
for (int hsv : { 64, 80, 128, 192, 256, 512 }) {
|
||||
if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
|
||||
if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
|
||||
if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA
|
||||
|
||||
for (bool mask : { true, false } ) {
|
||||
for (float max_bias : { 0.0f, 8.0f }) {
|
||||
|
||||
Reference in New Issue
Block a user