Compare commits

...

13 Commits

Author SHA1 Message Date
Akarshan Biswas 1e333d5bba SYCL: Disable reorder optimize by default and stop setting tensor extras when optimize is disabled (#13254)
* SYCL: Do not set tensor extras when reorder optimize is disabled

* SYCL: Disable reorder optimize by default
2025-05-06 20:27:06 +05:30
Xuan-Son Nguyen 2f54e348ad llama : fix build_ffn without gate (#13336)
* llama : fix build_ffn without gate

* fix build on windows

* Revert "fix build on windows"

This reverts commit fc420d3c7e.
2025-05-06 14:25:40 +02:00
Johannes Gäßler 2356fb1d53 CUDA: fix bad asserts for partial offload (#13337) 2025-05-06 13:58:51 +02:00
Sigbjørn Skjæret 764b85627b convert : qwen2/3moe : set yarn metadata if present (#13331)
* set yarn metadata if present

* add comment about enabling YaRN

Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>

---------

Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>
2025-05-06 11:12:06 +02:00
Johannes Gäßler 15a28ec8c7 CUDA: fix --split-mode row for MMQ (#13323) 2025-05-06 08:36:46 +02:00
compilade a7366faa5b gguf-py : avoid requiring pyside6 for other scripts (#13036)
- gguf-py : remove gguf-py/gguf/scripts/__init__.py because it's not needed

Implicit namespaces are supported since Python 3.3 (https://peps.python.org/pep-0420/),
and the entrypoints in pyproject.toml can directly refer to the main functions.
2025-05-05 22:27:31 -04:00
Johannes Gäßler 9070365020 CUDA: fix logic for clearing padding with -ngl 0 (#13320) 2025-05-05 22:32:13 +02:00
oobabooga 233461f812 sampling : Integrate Top-nσ into main sampling chain (and add it to the server) (#13264)
* sampling: add Top-nσ sampler to `llama-server` and sampler ordering

* revert: sampler ordering

* revert: VS' crappy auto-formatting

* revert: VS' crappy auto-formatting pt.2

* revert: my crappy eye sight...

* sampling: add XTC to Top-nσ sampler chain

* sampling: add Dyna. Temp. to Top-nσ sampler chain

* sampling: actually remove Top-nσ from sampler(oops)

* Integrate top_n_sigma into main sampler chain

* Define COMMON_SAMPLER_TYPE_TOP_N_SIGMA

* Formatting

* Lint

* Exit early in the sampler if nsigma < 0

---------

Co-authored-by: CasualAutopsy <casual_autopsy@outlook.com>
2025-05-05 22:12:19 +02:00
igardev b34c859146 server : Webui - change setText command from parent window to also send the message. (#13309)
* setText command from parent window for llama-vscode now sends the message automatically.

* Upgrade packages versions to fix vulnerabilities with "npm audit fix" command.

* Fix code formatting.

* Add index.html.gz changes.

* Revert "Upgrade packages versions to fix vulnerabilities with "npm audit fix" command."

This reverts commit 67687b7fda.

* easier approach

* add setTimeout

---------

Co-authored-by: igardev <ivailo.gardev@akros.ch>
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2025-05-05 16:03:31 +02:00
Xuan-Son Nguyen 9b61acf060 mtmd : rename llava directory to mtmd (#13311)
* mv llava to mtmd

* change ref everywhere
2025-05-05 16:02:55 +02:00
Xuan-Son Nguyen 5215b91e93 clip : fix confused naming ffn_up and ffn_down (#13290)
* clip :  fix confused naming ffn_up and ffn_down

* rm ffn_i/o/g naming

* rename n_embd, n_ff

* small fix

* no check n_ff
2025-05-05 12:54:44 +02:00
Sigbjørn Skjæret ae803bfc3d convert : bailingmoe : set yarn metadata if present (#13312) 2025-05-05 12:34:26 +02:00
Akarshan Biswas 66645a5285 SYCL: Disable mul_mat kernels for noncontiguous tensor b (#13308)
ggml-ci
2025-05-05 13:39:10 +05:30
61 changed files with 300 additions and 226 deletions
+19 -19
View File
@@ -1394,36 +1394,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
libllava.a: tools/llava/llava.cpp \
tools/llava/llava.h \
tools/llava/clip.cpp \
tools/llava/clip.h \
libllava.a: tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
common/stb_image.h \
common/base64.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llama-llava-cli: tools/llava/llava-cli.cpp \
tools/llava/llava.cpp \
tools/llava/llava.h \
tools/llava/clip.cpp \
tools/llava/clip.h \
llama-llava-cli: tools/mtmd/llava-cli.cpp \
tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-minicpmv-cli: tools/llava/minicpmv-cli.cpp \
tools/llava/llava.cpp \
tools/llava/llava.h \
tools/llava/clip.cpp \
tools/llava/clip.h \
llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-qwen2vl-cli: tools/llava/qwen2vl-cli.cpp \
tools/llava/llava.cpp \
tools/llava/llava.h \
tools/llava/clip.cpp \
tools/llava/clip.h \
llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+2 -2
View File
@@ -2211,14 +2211,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/llava/README.md",
"path to a multimodal projector file. see tools/mtmd/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.path = value;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
"URL to a multimodal projector file. see tools/llava/README.md",
"URL to a multimodal projector file. see tools/mtmd/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
+3 -1
View File
@@ -96,6 +96,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
};
// dimensionality reduction methods, used by cvector-generator
@@ -161,6 +162,7 @@ struct common_params_sampling {
std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P,
COMMON_SAMPLER_TYPE_TOP_P,
@@ -340,7 +342,7 @@ struct common_params {
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see tools/llava)
// multimodal models (see tools/mtmd)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
+46 -44
View File
@@ -229,51 +229,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
params.logit_bias.data()));
if (params.mirostat == 0) {
if (params.top_n_sigma >= 0) {
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
} else {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY:
{
std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY:
{
std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str());
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -475,6 +472,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x';
@@ -490,6 +488,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
@@ -504,6 +503,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "dry", COMMON_SAMPLER_TYPE_DRY },
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -517,6 +517,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -552,6 +553,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
+19 -1
View File
@@ -1778,6 +1778,12 @@ class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing `num_attention_heads` in config.json
if self.hf_arch == "VLlama3ForCausalLM":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@@ -2755,6 +2761,13 @@ class Qwen2MoeModel(TextModel):
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
# YaRN is not enabled by default
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
_experts: list[dict[str, Tensor]] | None = None
@@ -5680,7 +5693,12 @@ class BailingMoeModel(TextModel):
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+6 -6
View File
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh
python ./tools/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh
python ./tools/llava/convert_image_encoder_to_gguf.py \
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./tools/llava/convert_image_encoder_to_gguf.py \
```
```sh
python ./tools/llava/convert_image_encoder_to_gguf.py \
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo
## Android compile and run
### compile
refer to `tools/llava/android/build_64.sh`
refer to `tools/mtmd/android/build_64.sh`
```sh
mkdir tools/llava/android/build_64
cd tools/llava/android/build_64
mkdir tools/mtmd/android/build_64
cd tools/mtmd/android/build_64
../build_64.sh
```
### run on Android
+2 -2
View File
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
```sh
python ./tools/llava/glmedge-surgery.py -m ../model_path
python ./tools/mtmd/glmedge-surgery.py -m ../model_path
```
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
```sh
python ./tools/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
```
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
+6 -6
View File
@@ -37,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
2. Install the required Python packages:
```sh
pip install -r tools/llava/requirements.txt
pip install -r tools/mtmd/requirements.txt
```
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh
python ./tools/llava/llava_surgery.py -m ../llava-v1.5-7b
python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
```
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
```sh
python ./tools/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
@@ -69,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
2) Install the required Python packages:
```sh
pip install -r tools/llava/requirements.txt
pip install -r tools/mtmd/requirements.txt
```
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python tools/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
@@ -88,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
5) Create the visual gguf model:
```console
python ./tools/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
+2 -2
View File
@@ -29,8 +29,8 @@ cmake --build build --config Release
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
```bash
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
+2 -2
View File
@@ -28,8 +28,8 @@ cmake --build build --config Release
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
+2 -2
View File
@@ -28,8 +28,8 @@ cmake --build build --config Release
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
```bash
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
+2 -2
View File
@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+4
View File
@@ -673,11 +673,15 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+2 -2
View File
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX;
}
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
+2
View File
@@ -719,6 +719,7 @@ void launch_fattn(
size_t nb23 = V->nb[3];
if (need_f16_K && K->type != GGML_TYPE_F16) {
GGML_ASSERT(ggml_is_contiguously_allocated(K));
K_f16.alloc(ggml_nelements(K));
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -733,6 +734,7 @@ void launch_fattn(
}
if (need_f16_V && V->type != GGML_TYPE_F16) {
GGML_ASSERT(ggml_is_contiguously_allocated(V));
V_f16.alloc(ggml_nelements(V));
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
+14 -5
View File
@@ -555,8 +555,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
// initialize padding to 0 to avoid possible NaN values
size_t original_size = ggml_nbytes(tensor);
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
const size_t original_size = ggml_nbytes(tensor);
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size) {
ggml_cuda_set_device(ctx->device);
@@ -679,6 +679,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
if (ggml_is_quantized(tensor->type)) {
if (ne0 % MATRIX_ROW_PADDING != 0) {
GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
}
@@ -800,6 +801,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
@@ -851,6 +853,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
@@ -889,6 +892,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
@@ -970,6 +974,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
size_t total_size = 0;
@@ -1531,6 +1536,8 @@ static void ggml_cuda_op_mul_mat(
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
@@ -2062,9 +2069,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
ggml_tensor src0_slice = *src0;
src0_slice.ne[2] = 1;
src0_slice.nb[3] = src0_slice.nb[2];
src0_slice.data = (char *) src0->data + i02*nb02;
src0_slice.ne[2] = 1;
src0_slice.nb[3] = src0_slice.nb[2];
src0_slice.op = GGML_OP_VIEW;
src0_slice.view_src = dst->src[0]; // non-const pointer to src0
src0_slice.data = (char *) src0->data + i02*nb02;
ggml_tensor src1_slice;
memset(&src1_slice, 0, sizeof(src1_slice));
+14 -3
View File
@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const int64_t s01 = src0->nb[1] / ts_src0;
@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
ne00, ne01, ne1, s01, s1,
ne00, ne01, ne1, s01, ne11, s1,
ne02, ne12, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
ne00, ne01, ne_get_rows, s01, s1,
ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
ne02, ne02, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
ne00, row_diff, src1_ncols, stride01, nrows_dst,
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
1, 1, 0, 0, 0,
1, 1, 0, 0, 0,
use_stream_k};
+27 -27
View File
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
static __device__ __forceinline__ void mul_mat_q_process_tile(
const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
const int nrows_x, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static __global__ void mul_mat_q(
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
@@ -2648,8 +2648,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int col_high = ncols_dst;
int col_diff = ncols_dst;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
@@ -2689,7 +2689,7 @@ static __global__ void mul_mat_q(
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
@@ -2720,8 +2720,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int col_high = ncols_dst;
int col_diff = ncols_dst;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
@@ -2767,7 +2767,7 @@ static __global__ void mul_mat_q(
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
kbc += blocks_per_ne00;
@@ -2792,8 +2792,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int col_high = ncols_dst;
int col_diff = ncols_dst;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
@@ -2834,7 +2834,7 @@ static __global__ void mul_mat_q(
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
}
@@ -2842,7 +2842,7 @@ static __global__ void mul_mat_q(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
@@ -2851,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int bidx0 = blockIdx.x;
@@ -2925,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
dst += offset_dst;
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_y - jt*mmq_x - 1;
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_dst - jt*mmq_x - 1;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -2989,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
struct mmq_args {
const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
bool use_stream_k;
@@ -3025,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
const int ntzw = args.nchannels_y * args.nsamples_y;
const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
@@ -3040,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
}
@@ -3068,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
@@ -3077,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
@@ -3093,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
}
}
+11
View File
@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
{
+1
View File
@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
+5 -2
View File
@@ -193,7 +193,7 @@ static void ggml_check_sycl() try {
if (!initialized) {
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Running with Environment Variables:\n");
@@ -338,7 +338,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
assert(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
}
if (tensor->type == GGML_TYPE_Q4_0) {
if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
tensor->extra = extra;
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -3873,6 +3873,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
if (a->ne[3] != b->ne[3]) {
return false;
}
if (!ggml_is_contiguous(b)) {
return false;
}
ggml_type a_type = a->type;
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
+4
View File
@@ -1299,6 +1299,10 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
return ggml_is_contiguous_n(tensor, 2);
}
bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
}
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-7
View File
@@ -1,7 +0,0 @@
# pyright: reportUnusedImport=false
from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
from .gguf_dump import main as gguf_dump_entrypoint
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
from .gguf_editor_gui import main as gguf_editor_gui_entrypoint
+4 -7
View File
@@ -977,15 +977,12 @@ class TensorNameMap:
"visual.blocks.{bid}.norm2", # qwen2vl
),
# some namings are messed up because the original llava code swapped fc1 and fc2
# we have no better way to fix it, just be careful
# new models like pixtral use the correct naming
MODEL_TENSOR.V_ENC_FFN_UP: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
),
@@ -997,9 +994,9 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_FFN_DOWN: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
),
+6 -6
View File
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.16.2"
version = "0.16.3"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
@@ -36,8 +36,8 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint"
gguf-convert-endian = "gguf.scripts.gguf_convert_endian:main"
gguf-dump = "gguf.scripts.gguf_dump:main"
gguf-set-metadata = "gguf.scripts.gguf_set_metadata:main"
gguf-new-metadata = "gguf.scripts.gguf_new_metadata:main"
gguf-editor-gui = "gguf.scripts.gguf_editor_gui:main"
+1
View File
@@ -40,5 +40,6 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
llama-convert-hf-to-gguf = "convert_hf_to_gguf:main"
llama-convert-lora-to-gguf = "convert_lora_to_gguf:main"
llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main"
llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main"
+1 -1
View File
@@ -1,4 +1,4 @@
-r ../tools/llava/requirements.txt
-r ../tools/mtmd/requirements.txt
-r ../tools/server/bench/requirements.txt
-r ../tools/server/tests/requirements.txt
+1 -1
View File
@@ -782,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
} break;
}
if (type_gate == LLM_FFN_PAR) {
if (gate && type_gate == LLM_FFN_PAR) {
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
}
+4
View File
@@ -1750,6 +1750,10 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
if (ctx->n < 0.0f) {
return;
}
// find max logit and calculate mean
float max = cur_p->data[0].logit;
float logits_sum = 0;
+1 -1
View File
@@ -27,7 +27,7 @@ else()
add_subdirectory(run)
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(llava)
add_subdirectory(mtmd)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
+72 -75
View File
@@ -155,8 +155,8 @@ enum patch_merge_type {
struct clip_hparams {
int32_t image_size;
int32_t patch_size;
int32_t hidden_size;
int32_t n_intermediate;
int32_t n_embd;
int32_t n_ff;
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
@@ -191,12 +191,6 @@ struct clip_layer {
struct ggml_tensor * ln_1_w = nullptr;
struct ggml_tensor * ln_1_b = nullptr;
// ff
struct ggml_tensor * ff_i_w = nullptr; // legacy naming
struct ggml_tensor * ff_i_b = nullptr; // legacy naming
struct ggml_tensor * ff_o_w = nullptr; // legacy naming
struct ggml_tensor * ff_o_b = nullptr; // legacy naming
struct ggml_tensor * ff_up_w = nullptr;
struct ggml_tensor * ff_up_b = nullptr;
struct ggml_tensor * ff_gate_w = nullptr;
@@ -204,9 +198,6 @@ struct clip_layer {
struct ggml_tensor * ff_down_w = nullptr;
struct ggml_tensor * ff_down_b = nullptr;
struct ggml_tensor * ff_g_w = NULL;
struct ggml_tensor * ff_g_b = NULL;
// layernorm 2
struct ggml_tensor * ln_2_w = nullptr;
struct ggml_tensor * ln_2_b = nullptr;
@@ -388,9 +379,9 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size;
const int n_embd = hparams.n_embd;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int d_head = n_embd / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
@@ -411,7 +402,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_reshape_2d(ctx0, inp, num_patches, n_embd);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
@@ -456,7 +447,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
cur = ggml_cont_2d(ctx0, KQV, n_embd, num_patches);
}
// attention output
@@ -473,14 +464,14 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
}
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b);
// siglip uses gelu
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
@@ -504,11 +495,11 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
const int kernel_size = patches_per_image / tokens_per_side;
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, n_embd, batch_size);
// doing a pool2d to reduce the number of output tokens to 256
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], n_embd, batch_size);
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
// apply norm before projection
@@ -637,9 +628,9 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
const int n_patches_x = image_size_width / patch_size;
const int n_patches_y = image_size_height / patch_size;
const int num_patches = n_patches_x * n_patches_y;
const int hidden_size = hparams.hidden_size;
const int n_embd = hparams.n_embd;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int d_head = n_embd / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
const int n_merge = hparams.spatial_merge_size;
@@ -669,7 +660,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
ggml_set_input(pos_w);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_reshape_2d(ctx0, inp, num_patches, n_embd);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
struct ggml_tensor * embeddings = inp;
@@ -710,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
cur = ggml_cont_2d(ctx0, KQV, n_embd, num_patches);
cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur);
}
@@ -753,8 +744,8 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
// reshape image tokens to 2D grid
cur = ggml_reshape_3d(ctx0, cur, hidden_size, n_patches_x, n_patches_y);
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, hidden_size]
cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
cur = ggml_cont(ctx0, cur);
// torch.nn.functional.unfold is just an im2col under the hood
@@ -762,7 +753,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
// project to hidden_size
// project to n_embd
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
embeddings = cur;
@@ -785,9 +776,9 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
// arrangement of the [IMG_BREAK] token
{
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows]
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
// after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows]
// after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
@@ -827,9 +818,9 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
const int hidden_size = hparams.hidden_size;
const int n_embd = hparams.n_embd;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int d_head = n_embd / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
@@ -864,14 +855,14 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
n_embd * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
n_embd * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
n_embd, patches_w * patches_h, batch_size);
if (model.patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@@ -904,11 +895,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
// embeddings shape: [n_embd, patches_w * patches_h, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * 4, patches_w * patches_h * batch_size / 4);
embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, patches_w * patches_h, batch_size);
}
// loop over layers
@@ -961,7 +952,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
cur = ggml_cont_3d(ctx0, KQV, n_embd, num_positions, batch_size);
}
// attention output
@@ -978,11 +969,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
// mlp
// ffn_up
auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_up_b);
auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_gate_b);
// TODO : only 2 of these 3 are actually used, should we remove one of them?
if (ctx->use_gelu) {
cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
@@ -994,8 +985,8 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
cur = ggml_mul(ctx0, cur_gate, cur_up);
// ffn_down
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
@@ -1011,7 +1002,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
@@ -1028,7 +1019,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
// embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
// embeddings shape: [n_embd, patches_w * patches_h, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
@@ -1074,9 +1065,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size;
const int n_embd = hparams.n_embd;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int d_head = n_embd / n_head;
const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
@@ -1114,17 +1105,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
n_embd * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
n_embd * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
n_embd, patches_w * patches_h, batch_size);
}
else {
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_reshape_3d(ctx0, inp, num_patches, n_embd, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
}
@@ -1137,7 +1128,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
// concat class_embeddings and patch_embeddings
if (model.class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, num_positions, batch_size);
embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
@@ -1234,7 +1225,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
cur = ggml_cont_3d(ctx0, KQV, n_embd, num_positions, batch_size);
}
// attention output
@@ -1252,8 +1243,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
}
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b);
if (ctx->use_gelu) {
cur = ggml_gelu_inplace(ctx0, cur);
@@ -1263,8 +1254,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
cur = ggml_gelu_quick_inplace(ctx0, cur);
}
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
@@ -1496,9 +1487,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
}
{ // attention
int hidden_size = clip_n_mmproj_embd(ctx);
int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = hidden_size/d_head;
int n_head = n_embd/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
num_query = 96;
@@ -1528,7 +1519,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
KQV = ggml_cont_3d(ctx0, KQV, n_embd, num_query, batch_size);
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
}
@@ -1571,7 +1562,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
}
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
@@ -1696,9 +1687,9 @@ struct clip_model_loader {
get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
get_u32(KEY_N_EMBD, hparams.hidden_size);
get_u32(KEY_N_EMBD, hparams.n_embd);
get_u32(KEY_N_HEAD, hparams.n_head);
get_u32(KEY_N_FF, hparams.n_intermediate);
get_u32(KEY_N_FF, hparams.n_ff);
get_u32(KEY_N_BLOCK, hparams.n_layer);
get_u32(KEY_PROJ_DIM, hparams.projection_dim);
get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
@@ -1807,6 +1798,7 @@ struct clip_model_loader {
}
void load_tensors() {
auto & hparams = ctx_clip.vision_model.hparams;
std::map<std::string, size_t> tensor_offset;
std::vector<ggml_tensor *> tensors_to_load;
@@ -1860,8 +1852,8 @@ struct clip_model_loader {
vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
// layers
vision_model.layers.resize(vision_model.hparams.n_layer);
for (int il = 0; il < vision_model.hparams.n_layer; ++il) {
vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
@@ -1884,13 +1876,18 @@ struct clip_model_loader {
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
// legacy naming (the in and out is reversed! don't ask me why)
layer.ff_i_w = layer.ff_down_w;
layer.ff_o_w = layer.ff_up_w;
layer.ff_g_w = layer.ff_gate_w;
layer.ff_i_b = layer.ff_down_b;
layer.ff_o_b = layer.ff_up_b;
layer.ff_g_b = layer.ff_gate_b;
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
// swap up and down weights
ggml_tensor * tmp = layer.ff_up_w;
layer.ff_up_w = layer.ff_down_w;
layer.ff_down_w = tmp;
// swap up and down biases
tmp = layer.ff_up_b;
layer.ff_up_b = layer.ff_down_b;
layer.ff_down_b = tmp;
}
}
switch (ctx_clip.proj_type) {
@@ -2904,7 +2901,7 @@ int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
}
int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.hidden_size;
return ctx->vision_model.hparams.n_embd;
}
const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
@@ -92,6 +92,10 @@ struct mtmd_cli_context {
batch = llama_batch_init(params.n_batch, 0, 1);
n_batch = params.n_batch;
if (!model || !lctx) {
exit(1);
}
if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
LOG_ERR("Model does not have chat template.\n");
LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");

Before

Width:  |  Height:  |  Size: 121 KiB

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.
+2
View File
@@ -146,6 +146,7 @@ struct slot_params {
{"top_k", sampling.top_k},
{"top_p", sampling.top_p},
{"min_p", sampling.min_p},
{"top_n_sigma", sampling.top_n_sigma},
{"xtc_probability", sampling.xtc_probability},
{"xtc_threshold", sampling.xtc_threshold},
{"typical_p", sampling.typ_p},
@@ -248,6 +249,7 @@ struct server_task {
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
@@ -157,6 +157,9 @@ export default function ChatScreen() {
clearExtraContext();
};
// for vscode context
textarea.refOnSubmit.current = sendNewMessage;
const handleEditMessage = async (msg: Message, content: string) => {
if (!viewingChat) return;
setCurrNodeId(msg.id);
@@ -37,6 +37,7 @@ export interface ChatTextareaApi {
setValue: (value: string) => void;
focus: () => void;
ref: React.RefObject<HTMLTextAreaElement>;
refOnSubmit: React.MutableRefObject<(() => void) | null>; // Submit handler
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
}
@@ -46,6 +47,7 @@ export interface ChatTextareaApi {
export function useChatTextarea(initValue: string): ChatTextareaApi {
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
const textareaRef = useRef<HTMLTextAreaElement>(null);
const onSubmitRef = useRef<(() => void) | null>(null);
// Effect to set initial value and height on mount or when initValue changes
useEffect(() => {
@@ -91,6 +93,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
}
},
ref: textareaRef,
refOnSubmit: onSubmitRef,
onInput: handleInput,
};
}
@@ -33,6 +33,9 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
});
}
textarea.focus();
setTimeout(() => {
textarea.refOnSubmit.current?.();
}, 10); // wait for setExtraContext to finish
}
};