mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 01:57:43 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7841fc723e | |||
| bf69cfe62f | |||
| 10f2e81809 | |||
| ba7654380a | |||
| 6ab2e4765a | |||
| 96e1280839 | |||
| 2c9f833d17 | |||
| 251364549f |
@@ -836,7 +836,7 @@ ifdef GGML_MUSA
|
||||
else
|
||||
MUSA_PATH ?= /opt/musa
|
||||
endif
|
||||
MUSA_ARCHITECTURES ?= 21;22
|
||||
MUSA_ARCHITECTURES ?= 21;22;31
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
|
||||
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
|
||||
|
||||
@@ -861,6 +861,9 @@ class Model:
|
||||
for token_id, token_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token: str = token_data["content"]
|
||||
if token_id >= vocab_size:
|
||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
if tokens[token_id] != token.encode("utf-8"):
|
||||
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
|
||||
@@ -3322,6 +3325,83 @@ class Gemma2Model(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
||||
class Gemma3Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA3
|
||||
has_vision: bool = False
|
||||
|
||||
# we need to merge the text_config into the root level of hparams
|
||||
def __init__(self, *args, **kwargs):
|
||||
hparams = Model.load_hparams(kwargs["dir_model"])
|
||||
if "text_config" in hparams:
|
||||
hparams = {**hparams, **hparams["text_config"]}
|
||||
kwargs["hparams"] = hparams
|
||||
super().__init__(*args, **kwargs)
|
||||
if "vision_config" in hparams:
|
||||
logger.info("Has vision encoder, but it will be ignored")
|
||||
self.has_vision = True
|
||||
|
||||
def write(self):
|
||||
super().write()
|
||||
if self.has_vision:
|
||||
logger.info("NOTE: this script only convert the language model to GGUF")
|
||||
logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py")
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
# some default values are not specified in the hparams
|
||||
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
|
||||
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
||||
# both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
|
||||
assert hparams.get("attn_logit_softcapping") is None
|
||||
assert hparams.get("final_logit_softcapping") is None
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
||||
if hparams.get("rope_scaling") is not None:
|
||||
assert hparams["rope_scaling"]["rope_type"] == "linear"
|
||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.startswith("language_model."):
|
||||
name = name.replace("language_model.", "")
|
||||
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
||||
or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later
|
||||
# ignore vision tensors
|
||||
return []
|
||||
|
||||
# remove OOV (out-of-vocabulary) rows in token_embd
|
||||
if "embed_tokens.weight" in name:
|
||||
vocab = self._create_vocab_sentencepiece()
|
||||
tokens = vocab[0]
|
||||
data_torch = data_torch[:len(tokens)]
|
||||
|
||||
# ref code in Gemma3RMSNorm
|
||||
# output = output * (1.0 + self.weight.float())
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch = data_torch + 1
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Starcoder2ForCausalLM")
|
||||
class StarCoder2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||
|
||||
+36
-12
@@ -197,29 +197,53 @@ The following compilation options are also available to tweak performance:
|
||||
|
||||
## MUSA
|
||||
|
||||
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
|
||||
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
|
||||
|
||||
- Using `CMake`:
|
||||
#### Download directly from Moore Threads
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON
|
||||
cmake --build build --config Release
|
||||
You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
|
||||
|
||||
### Compilation
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
#### Override Compute Capability Specifications
|
||||
|
||||
By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
|
||||
```
|
||||
|
||||
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
|
||||
|
||||
#### Compilation options
|
||||
|
||||
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||
|
||||
- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
|
||||
```
|
||||
|
||||
For static build:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON \
|
||||
-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
|
||||
### Runtime MUSA environmental variables
|
||||
|
||||
You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
|
||||
|
||||
```bash
|
||||
# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
|
||||
MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
|
||||
```
|
||||
|
||||
### Unified Memory
|
||||
|
||||
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
|
||||
|
||||
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||
|
||||
## HIP
|
||||
|
||||
This provides GPU acceleration on HIP-supported AMD GPUs.
|
||||
|
||||
@@ -51,6 +51,13 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-gemma3-cli)
|
||||
add_executable(${TARGET} gemma3-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-llava-clip-quantize-cli)
|
||||
add_executable(${TARGET} clip-quantize-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
# Gemma 3 vision
|
||||
|
||||
> [!IMPORTANT]
|
||||
>
|
||||
> This is very experimental, only used for demo purpose.
|
||||
|
||||
## How to get mmproj.gguf?
|
||||
|
||||
```bash
|
||||
cd gemma-3-4b-it
|
||||
python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
|
||||
|
||||
# output file is mmproj.gguf
|
||||
```
|
||||
|
||||
## How to run it?
|
||||
|
||||
What you need:
|
||||
- The text model GGUF, can be converted using `convert_hf_to_gguf.py`
|
||||
- The mmproj file from step above
|
||||
- An image file
|
||||
|
||||
```bash
|
||||
# build
|
||||
cmake -B build
|
||||
cmake --build build --target llama-gemma3-cli
|
||||
|
||||
# run it
|
||||
./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
|
||||
```
|
||||
+275
-84
@@ -4,31 +4,12 @@
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
//#include "ggml-cuda.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_SYCL
|
||||
//#include "ggml-sycl.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_METAL
|
||||
//#include "ggml-metal.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_CANN
|
||||
//#include "ggml-cann.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_VULKAN
|
||||
//#include "ggml-vulkan.h"
|
||||
//#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
@@ -155,6 +136,8 @@ static std::string format(const char * fmt, ...) {
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
#define TN_MINICPMV_QUERY "resampler.query"
|
||||
@@ -181,6 +164,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_RESAMPLER,
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_MERGER,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -191,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
||||
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
};
|
||||
|
||||
|
||||
@@ -317,7 +302,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
||||
return kv.first;
|
||||
}
|
||||
}
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
|
||||
}
|
||||
|
||||
#ifdef CLIP_DEBUG_FUNCTIONS
|
||||
@@ -574,6 +559,10 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_model_ln_kv_b;
|
||||
struct ggml_tensor * mm_model_ln_post_w;
|
||||
struct ggml_tensor * mm_model_ln_post_b;
|
||||
|
||||
// gemma3
|
||||
struct ggml_tensor * mm_input_proj_w;
|
||||
struct ggml_tensor * mm_soft_emb_norm_w;
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
@@ -588,7 +577,7 @@ struct clip_ctx {
|
||||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
||||
int32_t max_feature_layer;
|
||||
int32_t max_feature_layer; // unused in newer models like gemma3
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
@@ -600,21 +589,209 @@ struct clip_ctx {
|
||||
bool has_post_norm = false;
|
||||
bool has_patch_bias = false;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
struct gguf_context * ctx_gguf = nullptr;
|
||||
struct ggml_context * ctx_data = nullptr;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_gallocr_t compute_alloc = NULL;
|
||||
ggml_backend_t backend = nullptr;
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
ggml_backend_buffer_t buf = nullptr;
|
||||
|
||||
struct clip_image_size * load_image_size;
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
struct clip_image_size * load_image_size = nullptr;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
backend = ctx_params.use_gpu
|
||||
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
|
||||
: nullptr;
|
||||
|
||||
if (backend) {
|
||||
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
backend_ptrs.push_back(backend);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
||||
} else {
|
||||
backend = backend_cpu;
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
backend_ptrs.push_back(backend_cpu);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
||||
|
||||
sched.reset(
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
|
||||
);
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_backend_buffer_free(buf);
|
||||
ggml_backend_free(backend);
|
||||
if (backend_cpu != backend) {
|
||||
ggml_backend_free(backend_cpu);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
|
||||
GGML_ASSERT(imgs->size == 1); // batch_size == 1
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
// input raw
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
|
||||
ggml_set_name(inp_raw, "inp_raw");
|
||||
ggml_set_input(inp_raw);
|
||||
|
||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
|
||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
|
||||
// position embeddings
|
||||
struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// layernorm1
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
||||
|
||||
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
|
||||
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||
|
||||
struct ggml_tensor * K =
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
||||
|
||||
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
||||
|
||||
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
|
||||
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
|
||||
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
|
||||
}
|
||||
|
||||
// attention output
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
|
||||
embeddings = cur; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// layernorm2
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
||||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
||||
|
||||
// siglip uses gelu
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
|
||||
embeddings = cur;
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
if (ctx->has_post_norm) {
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
ggml_set_name(embeddings, "post_ln");
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
||||
}
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
const int batch_size = 1;
|
||||
const int mm_tokens_per_image = 256; // default value for gemma3
|
||||
const int tokens_per_side = sqrt(mm_tokens_per_image);
|
||||
const int patches_per_image = sqrt(num_patches);
|
||||
const int kernel_size = patches_per_image / tokens_per_side;
|
||||
|
||||
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
||||
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
|
||||
|
||||
// doing a pool2d to reduce the number of output tokens to 256
|
||||
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
|
||||
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
||||
|
||||
// apply norm before projection
|
||||
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
|
||||
|
||||
// apply projection
|
||||
embeddings = ggml_mul_mat(ctx0,
|
||||
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
|
||||
embeddings);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
@@ -1160,7 +1337,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
} else {
|
||||
GGML_ABORT("fatel error");
|
||||
}
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
@@ -1182,8 +1360,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
return gf;
|
||||
}
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return clip_image_build_graph_siglip(ctx, imgs);
|
||||
} else {
|
||||
// TODO: we should have one build_* function per model
|
||||
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
|
||||
}
|
||||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return clip_init(fname, clip_context_params{
|
||||
/* use_gpu */ true,
|
||||
/* verbosity */ verbosity,
|
||||
});
|
||||
}
|
||||
|
||||
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
|
||||
int verbosity = ctx_params.verbosity;
|
||||
struct ggml_context * meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@@ -1277,7 +1472,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx{};
|
||||
clip_ctx * new_clip = new clip_ctx(ctx_params);
|
||||
|
||||
// update projector type
|
||||
{
|
||||
@@ -1296,36 +1491,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
// new_clip->backend = ggml_backend_cuda_init(0);
|
||||
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_METAL
|
||||
// new_clip->backend = ggml_backend_metal_init();
|
||||
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_CANN
|
||||
// new_clip->backend = ggml_backend_cann_init(0);
|
||||
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_VULKAN
|
||||
// new_clip->backend = ggml_backend_vk_init(0);
|
||||
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_SYCL
|
||||
// new_clip->backend = ggml_backend_sycl_init(0);
|
||||
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
||||
//#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
|
||||
@@ -1363,8 +1528,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
GGML_ASSERT(new_clip->has_vision_encoder);
|
||||
GGML_ASSERT(!new_clip->has_text_encoder);
|
||||
|
||||
idx = get_key_idx(ctx, KEY_USE_GELU);
|
||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||
try {
|
||||
idx = get_key_idx(ctx, KEY_USE_GELU);
|
||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||
} catch (std::runtime_error & /*e*/) {
|
||||
new_clip->use_gelu = false;
|
||||
}
|
||||
|
||||
try {
|
||||
idx = get_key_idx(ctx, KEY_USE_SILU);
|
||||
@@ -1421,7 +1590,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
|
||||
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
|
||||
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||
@@ -1434,7 +1605,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return nullptr;
|
||||
}
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
@@ -1570,11 +1741,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
} catch(const std::exception& /*e*/) {
|
||||
vision_model.patch_embeddings_0 = nullptr;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
} catch(const std::exception& /*e*/) {
|
||||
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
||||
vision_model.position_embeddings = nullptr;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
|
||||
} catch(const std::exception& /*e*/) {
|
||||
@@ -1685,6 +1862,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
|
||||
vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
@@ -1720,14 +1901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
batch.data = nullptr;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
|
||||
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = new_clip->backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
|
||||
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
|
||||
if (size > 1) {
|
||||
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
@@ -2219,7 +2407,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ctx->has_glm_projector) {
|
||||
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
res_imgs->size = 1;
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
clip_image_u8 resized_image;
|
||||
@@ -2408,12 +2596,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
ggml_backend_buffer_free(ctx->params_buffer);
|
||||
ggml_backend_free(ctx->backend);
|
||||
ggml_gallocr_free(ctx->compute_alloc);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -2609,8 +2791,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
||||
|
||||
// set inputs
|
||||
const auto & model = ctx->vision_model;
|
||||
@@ -2749,6 +2932,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
// do nothing
|
||||
}
|
||||
else {
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
|
||||
@@ -2775,11 +2961,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
|
||||
return false;
|
||||
}
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
@@ -2959,6 +3147,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
return ctx->vision_model.mm_1_b->ne[0];
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return ctx->vision_model.mm_input_proj_w->ne[0];
|
||||
}
|
||||
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
|
||||
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
|
||||
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
int verbosity;
|
||||
};
|
||||
|
||||
// deprecated, use clip_init
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
|
||||
|
||||
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -0,0 +1,341 @@
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "stb_image.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <vector>
|
||||
#include <limits.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
static bool g_is_generating = false;
|
||||
|
||||
/**
|
||||
* Please note that this is NOT a production-ready stuff.
|
||||
* It is a playground for trying Gemma 3 vision capabilities.
|
||||
* For contributors: please keep this code simple and easy to understand.
|
||||
*/
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG(
|
||||
"Experimental CLI for using Gemma 3 vision model\n\n"
|
||||
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
|
||||
" -m and --mmproj are required\n"
|
||||
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
|
||||
argv[0]
|
||||
);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (g_is_generating) {
|
||||
g_is_generating = false;
|
||||
} else {
|
||||
console::cleanup();
|
||||
LOG("\nInterrupted by user\n");
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gemma3_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
common_init_result llama_init;
|
||||
|
||||
llama_model * model;
|
||||
llama_context * lctx;
|
||||
const llama_vocab * vocab;
|
||||
llama_batch batch;
|
||||
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
model = llama_init.model.get();
|
||||
lctx = llama_init.context.get();
|
||||
vocab = llama_model_get_vocab(model);
|
||||
n_threads = params.cpuparams.n_threads;
|
||||
batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
init_clip_model(params);
|
||||
}
|
||||
|
||||
void init_clip_model(common_params & params) {
|
||||
const char * clip_path = params.mmproj.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
|
||||
}
|
||||
|
||||
~gemma3_context() {
|
||||
clip_free(ctx_clip);
|
||||
}
|
||||
};
|
||||
|
||||
struct decode_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
|
||||
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
|
||||
common_batch_clear(ctx.batch);
|
||||
for (llama_token & t : tokens) {
|
||||
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
|
||||
}
|
||||
if (logits_last) {
|
||||
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
|
||||
}
|
||||
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
LOG_ERR("Failed to decode text\n");
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int eval_image(gemma3_context & ctx, std::string & fname) {
|
||||
std::vector<float> image_embd_v;
|
||||
int n_embd = llama_model_n_embd(ctx.model);
|
||||
int n_tokens = 256;
|
||||
image_embd_v.resize(n_tokens * n_embd);
|
||||
|
||||
bool ok;
|
||||
struct clip_image_u8 * img_u8 = clip_image_u8_init();
|
||||
ok = clip_image_load_from_file(fname.c_str(), img_u8);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image %s\n", fname.c_str());
|
||||
clip_image_u8_free(img_u8);
|
||||
return 2; // non-fatal error
|
||||
}
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
LOG("Encoding image %s\n", fname.c_str());
|
||||
ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
return 1;
|
||||
}
|
||||
LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
|
||||
// decode image embeddings
|
||||
int64_t t1 = ggml_time_ms();
|
||||
eval_text(ctx, "<start_of_image>");
|
||||
llama_set_causal_attn(ctx.lctx, false);
|
||||
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
|
||||
if (llama_decode(ctx.lctx, batch_img.batch)) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
return 1;
|
||||
}
|
||||
ctx.n_past += n_tokens;
|
||||
llama_set_causal_attn(ctx.lctx, true);
|
||||
eval_text(ctx, "<end_of_image>");
|
||||
LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
if (i > n_predict || !g_is_generating) {
|
||||
printf("\n");
|
||||
break;
|
||||
}
|
||||
|
||||
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
|
||||
common_sampler_accept(smpl, token_id, true);
|
||||
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
|
||||
printf("\n");
|
||||
break; // end of generation
|
||||
}
|
||||
|
||||
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// eval the token
|
||||
common_batch_clear(ctx.batch);
|
||||
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
LOG_ERR("failed to decode token\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
|
||||
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
|
||||
|
||||
// ctrl+C handling
|
||||
{
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (eval_text(ctx, "<bos>")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
for (auto & fname : params.image) {
|
||||
if (eval_image(ctx, fname)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||
return 1;
|
||||
}
|
||||
if (generate_response(ctx, smpl, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
LOG("\n Running in chat mode, available commands:");
|
||||
LOG("\n /image <path> load an image");
|
||||
LOG("\n /clear clear the chat history");
|
||||
LOG("\n /quit or /exit exit the program");
|
||||
LOG("\n");
|
||||
|
||||
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
g_is_generating = false;
|
||||
LOG("\n> ");
|
||||
console::set_display(console::user_input);
|
||||
std::string line;
|
||||
console::readline(line, false);
|
||||
console::set_display(console::reset);
|
||||
line = string_strip(line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (line == "/quit" || line == "/exit") {
|
||||
break;
|
||||
}
|
||||
if (line == "/clear") {
|
||||
ctx.n_past = 0;
|
||||
llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
|
||||
LOG("Chat history cleared\n\n");
|
||||
continue;
|
||||
}
|
||||
g_is_generating = true;
|
||||
if (line.find("/image") == 0) {
|
||||
std::string image = line.substr(7);
|
||||
int res = eval_image(ctx, image);
|
||||
if (res == 2) {
|
||||
continue; // image not found
|
||||
}
|
||||
if (res) {
|
||||
return 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||
return 1;
|
||||
}
|
||||
if (generate_response(ctx, smpl, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,307 @@
|
||||
import gguf
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import torch
|
||||
import json
|
||||
import os
|
||||
import numpy as np
|
||||
from typing import cast, ContextManager, Any, Iterator
|
||||
from pathlib import Path
|
||||
from torch import Tensor
|
||||
|
||||
logger = logging.getLogger("gemma3-mmproj")
|
||||
|
||||
|
||||
# (copied from convert_hf_to_gguf.py)
|
||||
# tree of lazy tensors
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
_tensor_type = torch.Tensor
|
||||
# to keep the type-checker happy
|
||||
dtype: torch.dtype
|
||||
shape: torch.Size
|
||||
|
||||
# only used when converting a torch.Tensor to a np.ndarray
|
||||
_dtype_map: dict[torch.dtype, type] = {
|
||||
torch.float16: np.float16,
|
||||
torch.float32: np.float32,
|
||||
}
|
||||
|
||||
# used for safetensors slices
|
||||
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
||||
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
||||
_dtype_str_map: dict[str, torch.dtype] = {
|
||||
"F64": torch.float64,
|
||||
"F32": torch.float32,
|
||||
"BF16": torch.bfloat16,
|
||||
"F16": torch.float16,
|
||||
# "U64": torch.uint64,
|
||||
"I64": torch.int64,
|
||||
# "U32": torch.uint32,
|
||||
"I32": torch.int32,
|
||||
# "U16": torch.uint16,
|
||||
"I16": torch.int16,
|
||||
"U8": torch.uint8,
|
||||
"I8": torch.int8,
|
||||
"BOOL": torch.bool,
|
||||
"F8_E4M3": torch.float8_e4m3fn,
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
args=(self,),
|
||||
func=(lambda s: s.numpy())
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
||||
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||
|
||||
@classmethod
|
||||
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
|
||||
dtype = cls._dtype_str_map[st_slice.get_dtype()]
|
||||
shape: tuple[int, ...] = tuple(st_slice.get_shape())
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
|
||||
if func is torch.Tensor.numpy:
|
||||
return args[0].numpy()
|
||||
|
||||
return cls._wrap_fn(func)(*args, **kwargs)
|
||||
|
||||
|
||||
class Gemma3VisionTower:
|
||||
hparams: dict
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
fname_out: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
|
||||
@staticmethod
|
||||
def load_hparams(dir_model: Path):
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
|
||||
part_names: list[str] = []
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith(prefix) and filename.endswith(suffix):
|
||||
part_names.append(filename)
|
||||
part_names.sort()
|
||||
return part_names
|
||||
|
||||
def __init__(self,
|
||||
dir_model: Path,
|
||||
fname_out: Path,
|
||||
ftype: gguf.LlamaFileType,
|
||||
is_big_endian: bool,):
|
||||
hparams = Gemma3VisionTower.load_hparams(dir_model)
|
||||
self.hparams = hparams
|
||||
self.fname_out = fname_out
|
||||
self.ftype = ftype
|
||||
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
|
||||
|
||||
text_config = hparams["text_config"]
|
||||
vision_config = hparams["vision_config"]
|
||||
|
||||
assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
|
||||
assert text_config is not None
|
||||
assert vision_config is not None
|
||||
|
||||
self.gguf_writer.add_string ("clip.projector_type", "gemma3")
|
||||
self.gguf_writer.add_bool ("clip.has_text_encoder", False)
|
||||
self.gguf_writer.add_bool ("clip.has_vision_encoder", True)
|
||||
self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy
|
||||
self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
|
||||
self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
|
||||
# default values taken from HF tranformers code
|
||||
self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5])
|
||||
self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5])
|
||||
self.gguf_writer.add_bool ("clip.use_gelu", True)
|
||||
|
||||
# load tensors
|
||||
for name, data_torch in self.get_tensors(dir_model):
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
self.add_tensor(name, data_torch)
|
||||
|
||||
def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
|
||||
part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
|
||||
tensor_names_from_parts: set[str] = set()
|
||||
for part_name in part_names:
|
||||
logger.info(f"gguf: loading model part '{part_name}'")
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
|
||||
with ctx as model_part:
|
||||
tensor_names_from_parts.update(model_part.keys())
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_slice(name)
|
||||
data = LazyTorchTensor.from_safetensors_slice(data)
|
||||
yield name, data
|
||||
|
||||
def add_tensor(self, name: str, data_torch: Tensor):
|
||||
is_1d = len(data_torch.shape) == 1
|
||||
is_embd = ".embeddings." in name
|
||||
old_dtype = data_torch.dtype
|
||||
can_quantize = not is_1d and not is_embd
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# this is to support old checkpoint
|
||||
# TODO: remove this when we have the final model
|
||||
name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
|
||||
name = name.replace("multimodal_projector.", "multi_modal_projector.")
|
||||
|
||||
# filter only vision tensors
|
||||
if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
|
||||
return
|
||||
# prefix
|
||||
name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
|
||||
name = name.replace("vision_tower.vision_model.", "v.")
|
||||
# projector and input embd
|
||||
name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
|
||||
name = name.replace(".embeddings.position_embedding.", ".position_embd.")
|
||||
name = name.replace(
|
||||
"multi_modal_projector.mm_input_projection_weight",
|
||||
"mm.input_projection.weight"
|
||||
)
|
||||
name = name.replace(
|
||||
"multi_modal_projector.mm_soft_emb_norm.weight",
|
||||
"mm.soft_emb_norm.weight"
|
||||
)
|
||||
name = name.replace("post_layernorm.", "post_ln.")
|
||||
# each block
|
||||
name = name.replace(".self_attn.k_proj.", ".attn_k.")
|
||||
name = name.replace(".self_attn.v_proj.", ".attn_v.")
|
||||
name = name.replace(".self_attn.q_proj.", ".attn_q.")
|
||||
name = name.replace(".self_attn.out_proj.", ".attn_out.")
|
||||
name = name.replace(".layer_norm1.", ".ln1.")
|
||||
name = name.replace(".layer_norm2.", ".ln2.")
|
||||
name = name.replace(".mlp.fc1.", ".ffn_down.")
|
||||
name = name.replace(".mlp.fc2.", ".ffn_up.")
|
||||
|
||||
if can_quantize:
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {self.ftype}")
|
||||
|
||||
# corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
|
||||
# the other norm values are part of SigLIP model, and they are already correct
|
||||
# ref code: Gemma3RMSNorm
|
||||
if "soft_emb_norm.weight" in name:
|
||||
logger.info(f"Correcting norm value for '{name}'")
|
||||
data_torch = data_torch + 1
|
||||
|
||||
data = data_torch.numpy()
|
||||
|
||||
try:
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
except Exception as e:
|
||||
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
|
||||
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
|
||||
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||
|
||||
def write(self):
|
||||
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.write_tensors_to_file(progress=True)
|
||||
self.gguf_writer.close()
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert Gemma 3 vision tower safetensors to GGUF format",)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path, default="mmproj.gguf",
|
||||
help="path to write to",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
|
||||
help="output format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
help="model is executed on big endian machine",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file",
|
||||
nargs="?",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", action="store_true",
|
||||
help="increase output verbosity",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.model is None:
|
||||
parser.error("the following arguments are required: model")
|
||||
return args
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
dir_model = args.model
|
||||
|
||||
if not dir_model.is_dir():
|
||||
logger.error(f'Error: {args.model} is not a directory')
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
}
|
||||
|
||||
logger.info(f"Loading model: {dir_model.name}")
|
||||
|
||||
with torch.inference_mode():
|
||||
gemma3_vision_tower = Gemma3VisionTower(
|
||||
dir_model=dir_model,
|
||||
fname_out=args.outfile,
|
||||
ftype=ftype_map[args.outtype],
|
||||
is_big_endian=args.bigendian,
|
||||
)
|
||||
gemma3_vision_tower.write()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ params->verbosity,
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
|
||||
@@ -497,7 +497,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
search_paths.push_back(get_executable_path());
|
||||
search_paths.push_back(fs::current_path());
|
||||
} else {
|
||||
search_paths.push_back(user_search_path);
|
||||
search_paths.push_back(fs::u8path(user_search_path));
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
@@ -511,9 +511,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||
for (const auto & entry : dir_it) {
|
||||
if (entry.is_regular_file()) {
|
||||
auto filename = entry.path().filename().native();
|
||||
auto ext = entry.path().extension().native();
|
||||
if (filename.find(file_prefix) == 0 && ext == file_extension) {
|
||||
auto filename = entry.path().filename();
|
||||
auto ext = entry.path().extension();
|
||||
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
|
||||
dl_handle_ptr handle { dl_load_library(entry) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
|
||||
@@ -544,7 +544,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
|
||||
fs::path path = search_path.native() + filename.native();
|
||||
fs::path path = search_path / filename;
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
|
||||
@@ -395,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
||||
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||
#elif defined(RDNA1) || defined(__gfx900__)
|
||||
int tmp1;
|
||||
int tmp2;
|
||||
asm("\n \
|
||||
|
||||
+140
-57
@@ -47,11 +47,89 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
1;
|
||||
}
|
||||
|
||||
enum mmvq_parameter_table_id {
|
||||
MMVQ_PARAMETERS_GENERIC = 0,
|
||||
MMVQ_PARAMETERS_GCN,
|
||||
MMVQ_PARAMETERS_RDNA2
|
||||
};
|
||||
|
||||
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||
#if defined(RDNA2) || defined(RDNA3)
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
#elif defined(GCN) || defined(CDNA)
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
#else
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
}
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
return 4;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
return 2;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
} else if (table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
return 2;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
return 1;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
return 2;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_y>
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||
@@ -59,24 +137,20 @@ static __global__ void mul_mat_vec_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
||||
constexpr int nwarps = calc_nwarps(ncols_y, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id);
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||
constexpr int nwarps = 1;
|
||||
constexpr int rows_per_cuda_block = 1;
|
||||
#else
|
||||
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
||||
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
const int tid = warp_size*threadIdx.y + threadIdx.x;
|
||||
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
const int blocks_per_col_y = nrows_y / QK8_1;
|
||||
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
||||
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
|
||||
|
||||
// partial sum for each thread
|
||||
// partial sum for each thread
|
||||
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
|
||||
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
@@ -96,7 +170,7 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
}
|
||||
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size];
|
||||
if (threadIdx.y > 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_y; ++j) {
|
||||
@@ -120,7 +194,7 @@ static __global__ void mul_mat_vec_q(
|
||||
for (int l = 0; l < nwarps-1; ++l) {
|
||||
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
|
||||
}
|
||||
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
||||
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
|
||||
@@ -129,6 +203,13 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
}
|
||||
|
||||
static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id);
|
||||
const dim3 block_nums(nblocks, 1, 1);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1);
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
static void mul_mat_vec_q_cuda(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
@@ -137,65 +218,67 @@ static void mul_mat_vec_q_cuda(
|
||||
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
|
||||
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
||||
|
||||
int id = ggml_cuda_get_device();
|
||||
|
||||
int64_t nwarps = 1;
|
||||
int64_t rows_per_cuda_block = 1;
|
||||
|
||||
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
|
||||
switch(ncols_y) {
|
||||
case 1:
|
||||
nwarps = 4;
|
||||
rows_per_cuda_block = 1;
|
||||
break;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
nwarps = 4;
|
||||
rows_per_cuda_block = 2;
|
||||
break;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
nwarps = 2;
|
||||
rows_per_cuda_block = 2;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
|
||||
const dim3 block_nums(nblocks, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
|
||||
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 2;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 3;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 4;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 5:
|
||||
mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 5;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 6:
|
||||
mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 6;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 7:
|
||||
mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 7;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
case 8:
|
||||
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
{
|
||||
constexpr int c_ncols_y = 8;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
|
||||
+147
-132
@@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
|
||||
static struct ggml_backend_metal_device_context {
|
||||
id<MTLDevice> mtl_device;
|
||||
int mtl_device_ref_count;
|
||||
id<MTLLibrary> mtl_library;
|
||||
|
||||
bool has_simdgroup_reduction;
|
||||
bool has_simdgroup_mm;
|
||||
@@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
|
||||
} g_ggml_ctx_dev_main = {
|
||||
/*.mtl_device =*/ nil,
|
||||
/*.mtl_device_ref_count =*/ 0,
|
||||
/*.mtl_library =*/ nil,
|
||||
/*.has_simdgroup_reduction =*/ false,
|
||||
/*.has_simdgroup_mm =*/ false,
|
||||
/*.has_residency_sets =*/ false,
|
||||
@@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
ctx->mtl_device_ref_count--;
|
||||
|
||||
if (ctx->mtl_device_ref_count == 0) {
|
||||
if (ctx->mtl_library) {
|
||||
[ctx->mtl_library release];
|
||||
ctx->mtl_library = nil;
|
||||
}
|
||||
|
||||
if (ctx->mtl_device) {
|
||||
[ctx->mtl_device release];
|
||||
ctx->mtl_device = nil;
|
||||
@@ -495,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) {
|
||||
return data;
|
||||
}
|
||||
|
||||
// load library
|
||||
//
|
||||
// - first check if the library is embedded
|
||||
// - then check if the library is in the bundle
|
||||
// - if not found, load the source and compile it
|
||||
// - if that fails, return NULL
|
||||
static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
|
||||
id<MTLLibrary> metal_library = nil;
|
||||
NSError * error = nil;
|
||||
NSString * src = nil;
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
|
||||
#else
|
||||
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (path_lib == nil) {
|
||||
// Try to find the resource in the directory where the current binary located.
|
||||
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
|
||||
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
|
||||
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
|
||||
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
|
||||
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
|
||||
// Optionally, if this is a symlink, try to resolve it.
|
||||
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
|
||||
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
|
||||
// It is a relative path, adding the binary directory as directory prefix.
|
||||
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
|
||||
}
|
||||
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
// Link to the resource could not be resolved.
|
||||
default_metallib_path = nil;
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The resource couldn't be found in the binary's directory.
|
||||
default_metallib_path = nil;
|
||||
}
|
||||
path_lib = default_metallib_path;
|
||||
}
|
||||
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
|
||||
metal_library = [device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * path_source;
|
||||
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
|
||||
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
||||
|
||||
if (path_resource) {
|
||||
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
||||
} else {
|
||||
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
}
|
||||
|
||||
if (path_source == nil) {
|
||||
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
||||
path_source = @"ggml-metal.metal";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!metal_library) {
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
if (use_bfloat) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
metal_library = [device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if !__has_feature(objc_arc)
|
||||
[options release];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
return metal_library;
|
||||
}
|
||||
|
||||
static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_INFO("%s: allocating\n", __func__);
|
||||
|
||||
@@ -522,136 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
id<MTLLibrary> metal_library = nil;
|
||||
|
||||
// load library
|
||||
//
|
||||
// - first check if the library is embedded
|
||||
// - then check if the library is in the bundle
|
||||
// - if not found, load the source and compile it
|
||||
// - if that fails, return NULL
|
||||
{
|
||||
NSError * error = nil;
|
||||
NSString * src = nil;
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
|
||||
#else
|
||||
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (path_lib == nil) {
|
||||
// Try to find the resource in the directory where the current binary located.
|
||||
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
|
||||
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
|
||||
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
|
||||
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
|
||||
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
|
||||
// Optionally, if this is a symlink, try to resolve it.
|
||||
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
|
||||
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
|
||||
// It is a relative path, adding the binary directory as directory prefix.
|
||||
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
|
||||
}
|
||||
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
// Link to the resource could not be resolved.
|
||||
default_metallib_path = nil;
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The resource couldn't be found in the binary's directory.
|
||||
default_metallib_path = nil;
|
||||
}
|
||||
path_lib = default_metallib_path;
|
||||
}
|
||||
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
|
||||
metal_library = [device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * path_source;
|
||||
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
|
||||
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
||||
|
||||
if (path_resource) {
|
||||
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
||||
} else {
|
||||
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
}
|
||||
|
||||
if (path_source == nil) {
|
||||
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
||||
path_source = @"ggml-metal.metal";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!metal_library) {
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
if (ctx_dev->use_bfloat) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
metal_library = [device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if !__has_feature(objc_arc)
|
||||
[options release];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
if (ctx_dev->mtl_library == nil) {
|
||||
ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
|
||||
}
|
||||
id<MTLLibrary> metal_library = ctx_dev->mtl_library;
|
||||
if (metal_library == nil) {
|
||||
GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// print MTL GPU family:
|
||||
@@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
[metal_function release]; \
|
||||
if (error) { \
|
||||
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
[metal_library release]; \
|
||||
return NULL; \
|
||||
} \
|
||||
} else { \
|
||||
@@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
|
||||
}
|
||||
|
||||
[metal_library release];
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ if (MUSAToolkit_FOUND)
|
||||
message(STATUS "MUSA Toolkit found")
|
||||
|
||||
if (NOT DEFINED MUSA_ARCHITECTURES)
|
||||
set(MUSA_ARCHITECTURES "21;22")
|
||||
set(MUSA_ARCHITECTURES "21;22;31")
|
||||
endif()
|
||||
message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
|
||||
|
||||
|
||||
@@ -5,23 +5,24 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache1[2][BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache2[2][BLOCK_SIZE/16][16];
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
uint csel = 0;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
csel ^= 1;
|
||||
|
||||
barrier();
|
||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||
if (i < num_blocks_per_row) {
|
||||
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
|
||||
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
}
|
||||
barrier();
|
||||
|
||||
@@ -29,8 +30,8 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
continue;
|
||||
} else {
|
||||
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
|
||||
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -57,22 +58,22 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[csel][ix][ 8*v_im] * qs_u32_0[l ],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache1[csel][ix][1 + 8*v_im] * qs_u32_0[l+2],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache1[csel][ix][2 + 8*v_im] * qs_u32_2[l ],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache1[csel][ix][3 + 8*v_im] * qs_u32_2[l+2],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache1[csel][ix][4 + 8*v_im] * qs_u32_4[l ],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache1[csel][ix][5 + 8*v_im] * qs_u32_4[l+2],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache1[csel][ix][6 + 8*v_im] * qs_u32_6[l ],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache1[csel][ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[csel][ix][ 8*v_im],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache2[csel][ix][1 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache2[csel][ix][2 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache2[csel][ix][3 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache2[csel][ix][4 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache2[csel][ix][5 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache2[csel][ix][6 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2))))))));
|
||||
}
|
||||
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
|
||||
}
|
||||
|
||||
@@ -5,20 +5,21 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
|
||||
shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][2][8];
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
uint csel = 0;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
csel ^= 1;
|
||||
|
||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||
barrier();
|
||||
if (i < num_blocks_per_row)
|
||||
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
barrier();
|
||||
|
||||
if (i >= num_blocks_per_row)
|
||||
@@ -40,8 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
|
||||
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
|
||||
|
||||
if (all_threads) {
|
||||
barrier();
|
||||
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -59,14 +59,14 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
|
||||
fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
|
||||
fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
|
||||
fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
|
||||
fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
|
||||
fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
|
||||
fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
|
||||
fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
|
||||
sum = fma(FLOAT_TYPE( b0[l]) * sccache[csel][ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
|
||||
fma(FLOAT_TYPE( b16[l]) * sccache[csel][ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
|
||||
fma(FLOAT_TYPE( b32[l]) * sccache[csel][ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
|
||||
fma(FLOAT_TYPE( b48[l]) * sccache[csel][ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
|
||||
fma(FLOAT_TYPE( b64[l]) * sccache[csel][ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
|
||||
fma(FLOAT_TYPE( b80[l]) * sccache[csel][ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
|
||||
fma(FLOAT_TYPE( b96[l]) * sccache[csel][ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
|
||||
fma(FLOAT_TYPE(b112[l]) * sccache[csel][ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
|
||||
}
|
||||
temp[j][n] = fma(d, sum, temp[j][n]);
|
||||
}
|
||||
|
||||
@@ -6,20 +6,21 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][16];
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
uint csel = 0;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
csel ^= 1;
|
||||
|
||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||
barrier();
|
||||
if (i < num_blocks_per_row)
|
||||
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
barrier();
|
||||
|
||||
if (i >= num_blocks_per_row)
|
||||
@@ -51,8 +52,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
|
||||
|
||||
if (all_threads) {
|
||||
barrier();
|
||||
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
|
||||
sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
|
||||
}
|
||||
temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
|
||||
temp[j][n] = fma(fma(sum[0], sccache[csel][ix][s_offset], fma(sum[1], sccache[csel][ix][s_offset + 2], fma(sum[2], sccache[csel][ix][s_offset + 4], sum[3] * sccache[csel][ix][s_offset + 6]))), d, temp[j][n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -777,7 +777,7 @@ void main() {
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
|
||||
[[unroll]] for (uint col = 0; col < BN; col += storestride) {
|
||||
[[unroll]] for (uint col = 0; col < TN; col += storestride) {
|
||||
const uint row_i = dc + cm_col * TN + col + store_c;
|
||||
if (row_i >= _ne1) break;
|
||||
|
||||
|
||||
@@ -253,6 +253,7 @@ class MODEL_ARCH(IntEnum):
|
||||
MINICPM3 = auto()
|
||||
GEMMA = auto()
|
||||
GEMMA2 = auto()
|
||||
GEMMA3 = auto()
|
||||
STARCODER2 = auto()
|
||||
RWKV6 = auto()
|
||||
RWKV6QWEN2 = auto()
|
||||
@@ -440,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.MINICPM3: "minicpm3",
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.GEMMA3: "gemma3",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.RWKV6: "rwkv6",
|
||||
MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
|
||||
@@ -1077,6 +1079,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_PRE_NORM,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.GEMMA3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_PRE_NORM,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.STARCODER2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_MINICPM3, "minicpm3" },
|
||||
{ LLM_ARCH_GEMMA, "gemma" },
|
||||
{ LLM_ARCH_GEMMA2, "gemma2" },
|
||||
{ LLM_ARCH_GEMMA3, "gemma3" },
|
||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
{ LLM_ARCH_XVERSE, "xverse" },
|
||||
@@ -766,6 +767,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GEMMA3,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_STARCODER2,
|
||||
{
|
||||
|
||||
@@ -40,6 +40,7 @@ enum llm_arch {
|
||||
LLM_ARCH_MINICPM3,
|
||||
LLM_ARCH_GEMMA,
|
||||
LLM_ARCH_GEMMA2,
|
||||
LLM_ARCH_GEMMA3,
|
||||
LLM_ARCH_STARCODER2,
|
||||
LLM_ARCH_MAMBA,
|
||||
LLM_ARCH_XVERSE,
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
@@ -864,6 +865,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 26: type = LLM_TYPE_1B; break;
|
||||
case 34: type = LLM_TYPE_4B; break;
|
||||
case 48: type = LLM_TYPE_12B; break;
|
||||
case 62: type = LLM_TYPE_27B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
hparams.f_attention_scale = type == LLM_TYPE_27B
|
||||
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
||||
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -2454,6 +2472,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -3650,6 +3697,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
||||
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
|
||||
LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
@@ -3923,6 +3971,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_PHIMOE:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
case LLM_ARCH_OPENELM:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
|
||||
+147
@@ -4978,6 +4978,149 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gemma3() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
|
||||
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
||||
if (ubatch.token) {
|
||||
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
||||
cb(inpL, "inp_scaled", -1);
|
||||
}
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
// gemma3 requires different mask for layers using sliding window (SWA)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
|
||||
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
|
||||
|
||||
// "5-to-1 interleaved attention"
|
||||
// 5 layers of local attention followed by 1 layer of global attention
|
||||
static const int sliding_window_pattern = 6;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const bool is_sliding = (il + 1) % sliding_window_pattern;
|
||||
const float freq_base_l = is_sliding ? 10000.0f : freq_base;
|
||||
const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
|
||||
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
||||
model.layers[il].attn_q_norm,
|
||||
NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
||||
model.layers[il].attn_k_norm,
|
||||
NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.layers[il].attn_post_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
||||
cb(sa_out, "sa_out", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, sa_out, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_starcoder2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||
@@ -8298,6 +8441,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_gemma2();
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
result = llm.build_gemma3();
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
result = llm.build_starcoder2();
|
||||
|
||||
@@ -4113,7 +4113,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (int n_mats : {4, 8}) {
|
||||
for (int n_used : {1, 2, 4}) {
|
||||
for (bool b : {false, true}) {
|
||||
for (int n : {1, 32}) {
|
||||
for (int n : {1, 32, 129}) {
|
||||
int m = 512;
|
||||
int k = 256;
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
||||
|
||||
Reference in New Issue
Block a user