mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
34 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 00fa15fedc | |||
| 946b1f6859 | |||
| 6c6e397aff | |||
| afc0e89698 | |||
| a5771c9eea | |||
| c35f9eaf09 | |||
| 1f45f2890e | |||
| 613c5095c3 | |||
| 7f97599581 | |||
| bf78f5439e | |||
| bbfc849274 | |||
| ca0ef2dddb | |||
| 89d1029559 | |||
| f1a4e72de5 | |||
| 4762ad7316 | |||
| 1dc9614e06 | |||
| 446595b9b3 | |||
| 66906cd82a | |||
| 11dd5a44eb | |||
| 9b8f3c6c77 | |||
| c7f3169cd5 | |||
| 793c0d7f46 | |||
| ce111d39d6 | |||
| e7fecba934 | |||
| e2b7621e7c | |||
| c1dbea752a | |||
| 749e0d27f0 | |||
| 64bf1c3744 | |||
| c12bbde372 | |||
| 3f4fc97f1d | |||
| 2df255da3c | |||
| 60f816a79d | |||
| 5592f278b6 | |||
| e4868d16d2 |
@@ -1,10 +1,10 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc4.0.1
|
||||
ARG MUSA_VERSION=rc4.2.0
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=6.3
|
||||
ARG AMDGPU_VERSION=6.3
|
||||
ARG ROCM_VERSION=6.4
|
||||
ARG AMDGPU_VERSION=6.4
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
@@ -515,7 +515,7 @@ jobs:
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
|
||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
@@ -82,6 +82,7 @@ models/*
|
||||
models-mnt
|
||||
!models/.editorconfig
|
||||
!models/ggml-vocab-*.gguf*
|
||||
!models/templates
|
||||
|
||||
# Zig
|
||||
zig-out/
|
||||
|
||||
+1
-1
@@ -54,7 +54,7 @@ docker run --privileged -it \
|
||||
-v $HOME/llama.cpp/ci-cache:/ci-cache \
|
||||
-v $HOME/llama.cpp/ci-results:/ci-results \
|
||||
-v $PWD:/ws -w /ws \
|
||||
mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
|
||||
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
```
|
||||
|
||||
Inside the container, execute the following commands:
|
||||
|
||||
+155
-7
@@ -1900,6 +1900,7 @@ class StableLMModel(TextModel):
|
||||
"MixtralForCausalLM",
|
||||
"VLlama3ForCausalLM",
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
@@ -1912,6 +1913,11 @@ class LlamaModel(TextModel):
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
|
||||
def set_vocab(self):
|
||||
path_tekken_json = self.dir_model / "tekken.json"
|
||||
path_tokenizer_json = self.dir_model / "tokenizer.json"
|
||||
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
|
||||
return self.set_vocab_tekken()
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
@@ -1944,6 +1950,52 @@ class LlamaModel(TextModel):
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
def set_vocab_tekken(self):
|
||||
vocab = gguf.vocab.MistralVocab(self.dir_model)
|
||||
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
|
||||
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
assert len(tokens) == vocab.vocab_size, (
|
||||
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
|
||||
)
|
||||
|
||||
if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
|
||||
self.gguf_writer.add_tokenizer_pre("tekken")
|
||||
self.gguf_writer.add_token_merges(
|
||||
vocab.extract_vocab_merges_from_model()
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
|
||||
)
|
||||
|
||||
self.gguf_writer.add_bos_token_id(vocab.bos_id)
|
||||
self.gguf_writer.add_eos_token_id(vocab.eos_id)
|
||||
self.gguf_writer.add_unk_token_id(vocab.unk_id)
|
||||
self.gguf_writer.add_pad_token_id(vocab.pad_id)
|
||||
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_vocab_size(vocab.vocab_size)
|
||||
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(False)
|
||||
|
||||
script_dir = Path(__file__).parent
|
||||
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
|
||||
with open(template_path, "r", encoding="utf-8") as f:
|
||||
template = f.read()
|
||||
self.gguf_writer.add_chat_template(template)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@@ -1971,12 +2023,13 @@ class LlamaModel(TextModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
is_vision_tensor = "vision_tower" in name \
|
||||
is_multimodal_tensor = "vision_tower" in name \
|
||||
or "vision_model" in name \
|
||||
or "audio_tower" in name \
|
||||
or "model.connector" in name \
|
||||
or "multi_modal_projector" in name
|
||||
|
||||
if is_vision_tensor:
|
||||
if is_multimodal_tensor:
|
||||
return [] # skip vision tensors
|
||||
elif self.hf_arch == "LlamaModel":
|
||||
name = "model." + name
|
||||
@@ -3791,7 +3844,7 @@ class Plamo2Model(TextModel):
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
|
||||
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
|
||||
|
||||
# Mamba parameters
|
||||
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
||||
@@ -3802,7 +3855,7 @@ class Plamo2Model(TextModel):
|
||||
self.gguf_writer.add_ssm_group_count(0)
|
||||
|
||||
# MLP feed forward parameters (for attention layers)
|
||||
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
|
||||
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
@@ -7231,9 +7284,10 @@ class WhisperEncoderModel(MmprojModel):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["hidden_size"] = self.hparams["d_model"]
|
||||
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
|
||||
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
|
||||
if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
|
||||
self.hparams["hidden_size"] = self.hparams["d_model"]
|
||||
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
|
||||
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
@@ -7272,9 +7326,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
|
||||
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
|
||||
|
||||
|
||||
@ModelBase.register("VoxtralForConditionalGeneration")
|
||||
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
|
||||
has_vision_encoder = False # no vision encoder
|
||||
has_audio_encoder = True
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
|
||||
self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
|
||||
|
||||
|
||||
@ModelBase.register("FalconH1ForCausalLM")
|
||||
class FalconH1Model(Mamba2Model):
|
||||
model_arch = gguf.MODEL_ARCH.FALCON_H1
|
||||
@@ -7589,6 +7655,88 @@ class LFM2Model(TextModel):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("SmallThinkerForCausalLM")
|
||||
class SmallThinkerModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMALLTHINKER
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
||||
self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
|
||||
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
|
||||
if (self.hparams.get('moe_primary_router_apply_softmax')):
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
|
||||
else:
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
sliding_window_layout = self.hparams.get("sliding_window_layout")
|
||||
if sliding_window_layout:
|
||||
for i in sliding_window_layout:
|
||||
if i != 0:
|
||||
sliding_window = self.hparams.get("sliding_window_size")
|
||||
if sliding_window:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
break
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# process the experts separately
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down", "gate", "up"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
+38
-8
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- By default, NNPA is enabled when available. To disable it (not recommended):
|
||||
- By default, NNPA is disabled by default. To enable it:
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_NNPA=OFF
|
||||
-DGGML_NNPA=ON
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
@@ -84,9 +84,9 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
|
||||
You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).
|
||||
|
||||
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
|
||||
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
|
||||
|
||||
@@ -94,6 +94,14 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
|
||||
|
||||
Ensure that you have installed the required packages in advance
|
||||
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
Convert the `safetensors` model to `GGUF`
|
||||
|
||||
```bash
|
||||
python3 convert_hf_to_gguf.py \
|
||||
--outfile model-name-be.f16.gguf \
|
||||
@@ -116,7 +124,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
|
||||
```bash
|
||||
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
|
||||
@@ -141,15 +149,15 @@ Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by
|
||||
|
||||
### 2. NNPA Vector Intrinsics Acceleration
|
||||
|
||||
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
|
||||
### 3. zDNN Accelerator
|
||||
|
||||
_Only available in IBM z16 or later system. No direction at the moment._
|
||||
_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
|
||||
|
||||
### 4. Spyre Accelerator
|
||||
|
||||
_No direction at the moment._
|
||||
_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
@@ -189,6 +197,26 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
|
||||
Answer: Please ensure that your GCC compiler is of minimum GCC 15.1.0 version, and have `binutils` updated to the latest version. If this does not fix the problem, kindly open an issue.
|
||||
|
||||
4. Failing to install the `sentencepiece` package using GCC 15+
|
||||
|
||||
Answer: The `sentencepiece` team are aware of this as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).
|
||||
|
||||
As a temporary workaround, please run the installation command with the following environment variables.
|
||||
|
||||
```bash
|
||||
export CXXFLAGS="-include cstdint"
|
||||
```
|
||||
|
||||
For example,
|
||||
|
||||
```bash
|
||||
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
5. `-DGGML_NNPA=ON` generates gibberish output
|
||||
|
||||
Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
|
||||
|
||||
## Getting Help on IBM Z & LinuxONE
|
||||
|
||||
1. **Bugs, Feature Requests**
|
||||
@@ -244,3 +272,5 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
- ✅ - acceleration available
|
||||
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
||||
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
||||
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
|
||||
|
||||
@@ -23,11 +23,19 @@ The convert script reads the model configuration, tokenizer, tensor names+data a
|
||||
|
||||
The required steps to implement for an HF model are:
|
||||
|
||||
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
|
||||
1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:
|
||||
|
||||
```python
|
||||
@Model.register("MyModelForCausalLM")
|
||||
class MyModel(Model):
|
||||
@ModelBase.register("MyModelForCausalLM")
|
||||
class MyModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MYMODEL
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```python
|
||||
@ModelBase.register("MyModelForConditionalGeneration")
|
||||
class MyModel(MmprojModel):
|
||||
model_arch = gguf.MODEL_ARCH.MYMODEL
|
||||
```
|
||||
|
||||
@@ -75,9 +83,10 @@ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
|
||||
|
||||
Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
|
||||
- `Model#set_gguf_parameters`
|
||||
- `Model#set_vocab`
|
||||
- `Model#write_tensors`
|
||||
- `TextModel#set_gguf_parameters`
|
||||
- `MmprojModel#set_gguf_parameters`
|
||||
- `ModelBase#set_vocab`
|
||||
- `ModelBase#modify_tensors`
|
||||
|
||||
NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
|
||||
|
||||
|
||||
+1
-1
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
|
||||
|
||||
The defaults are:
|
||||
|
||||
- `MUSA_VERSION` set to `rc4.0.1`
|
||||
- `MUSA_VERSION` set to `rc4.2.0`
|
||||
|
||||
The resulting images, are essentially the same as the non-MUSA images:
|
||||
|
||||
|
||||
@@ -97,6 +97,9 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
# Qwen2-Audio and SeaLLM-Audio
|
||||
# note: no pre-quantized GGUF this model, as they have very poor result
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
|
||||
|
||||
# Mistral's Voxtral
|
||||
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
|
||||
```
|
||||
|
||||
**Mixed modalities**:
|
||||
|
||||
+93
-86
@@ -2,94 +2,101 @@
|
||||
|
||||
List of GGML operations and backend support status.
|
||||
|
||||
## How to add a backend to this table:
|
||||
|
||||
1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
|
||||
2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`
|
||||
|
||||
Legend:
|
||||
- ✅ Fully supported by this backend
|
||||
- 🟡 Partially supported by this backend
|
||||
- ❌ Not supported by this backend
|
||||
|
||||
| Operation | BLAS | CPU | CUDA | Metal |
|
||||
|-----------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | 🟡 |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DIV | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DUP | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| ELU | ❌ | ✅ | ❌ | 🟡 |
|
||||
| EXP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
|
||||
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ |
|
||||
| MUL | ❌ | ✅ | ✅ | 🟡 |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
|
||||
| NEG | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
|
||||
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
|
||||
| SCALE | ❌ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ✅ | ❌ | ✅ |
|
||||
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
|
||||
| SGN | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
|
||||
| SQR | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
|
||||
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
|
||||
| STEP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SUM | ❌ | ✅ | ✅ | ❌ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |
|
||||
| Operation | BLAS | CPU | CUDA | Metal | SYCL | Vulkan |
|
||||
|-----------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 |
|
||||
| CONV_2D | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| DIV | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| DUP | ❌ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| ELU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| EXP | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ | 🟡 | 🟡 |
|
||||
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
||||
| NEG | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROLL | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SCALE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SGN | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| SIN | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ | ❌ | ✅ |
|
||||
| SQR | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ |
|
||||
| SUM | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
|
||||
|
||||
+8133
-6534
File diff suppressed because it is too large
Load Diff
+7349
-6534
File diff suppressed because it is too large
Load Diff
+7349
-6534
File diff suppressed because it is too large
Load Diff
+8133
-6534
File diff suppressed because it is too large
Load Diff
+8133
File diff suppressed because it is too large
Load Diff
+8133
File diff suppressed because it is too large
Load Diff
+3
-1
@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
option(GGML_NNPA "ggml: enable nnpa" ON)
|
||||
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
||||
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||
|
||||
+125
-88
@@ -1,152 +1,189 @@
|
||||
@PACKAGE_INIT@
|
||||
|
||||
@GGML_VARIABLES_EXPANDED@
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
find_library(GGML_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
||||
|
||||
find_library(GGML_BASE_LIBRARY ggml-base
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
# Find all dependencies before creating any target.
|
||||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(Threads)
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if(NOT ACCELERATE_FRAMEWORK)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
||||
return()
|
||||
endif()
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
find_dependency(OpenMP)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
find_library(memkind memkind REQUIRED)
|
||||
find_library(memkind memkind)
|
||||
if(NOT memkind)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
||||
return()
|
||||
endif()
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
|
||||
endif()
|
||||
|
||||
if (GGML_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
find_dependency(BLAS)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "")
|
||||
find_dependency(CUDAToolkit)
|
||||
if (GGML_STATIC)
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cudart_static>)
|
||||
if (WIN32)
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas> $<LINK_ONLY:CUDA::cublasLt>)
|
||||
else()
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas_static> $<LINK_ONLY:CUDA::cublasLt_static>)
|
||||
endif()
|
||||
endif()
|
||||
if (NOT GGML_CUDA_NO_VMM)
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cuda_driver>)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
find_library(FOUNDATION_LIBRARY Foundation)
|
||||
find_library(METAL_FRAMEWORK Metal)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit)
|
||||
if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
||||
return()
|
||||
endif()
|
||||
set(GGML_METAL_INTERFACE_LINK_LIBRARIES
|
||||
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
|
||||
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
||||
if (GGML_OPENCL)
|
||||
find_dependency(OpenCL)
|
||||
set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:OpenCL::OpenCL>)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_package(Vulkan REQUIRED)
|
||||
list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
|
||||
find_dependency(Vulkan)
|
||||
set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:Vulkan::Vulkan>)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
||||
find_dependency(hip)
|
||||
find_dependency(hipblas)
|
||||
find_dependency(rocblas)
|
||||
set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
|
||||
find_package(DNNL)
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
|
||||
endif()
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
find_dependency(IntelSYCL)
|
||||
find_dependency(MKL)
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
if(NOT TARGET ggml::ggml)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
find_library(GGML_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
||||
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
find_library(GGML_BASE_LIBRARY ggml-base
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
|
||||
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
|
||||
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
add_library(ggml::all INTERFACE IMPORTED)
|
||||
set_target_properties(ggml::all
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||
add_library(ggml::all INTERFACE IMPORTED)
|
||||
set_target_properties(ggml::all
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||
|
||||
endif()
|
||||
|
||||
check_required_components(ggml)
|
||||
|
||||
@@ -647,6 +647,7 @@ struct ggml_backend_sched {
|
||||
// pipeline parallelism support
|
||||
int n_copies;
|
||||
int cur_copy;
|
||||
int next_copy;
|
||||
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
||||
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
||||
int n_graph_inputs;
|
||||
@@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
|
||||
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
||||
GGML_ASSERT(!sched->is_alloc);
|
||||
|
||||
sched->cur_copy = sched->next_copy;
|
||||
sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
|
||||
|
||||
ggml_backend_sched_split_graph(sched, graph);
|
||||
|
||||
@@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
||||
// if the graph is not already allocated, always use copy 0 after a synchronization
|
||||
// this ensures that during generation the same copy is used every time,
|
||||
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
|
||||
sched->cur_copy = 0;
|
||||
sched->next_copy = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
for (int i = 0; i < final_dims; i++) {
|
||||
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
|
||||
}
|
||||
size_t elem_offset = offset / ggml_element_size(tensor);
|
||||
acl_storage_len += elem_offset;
|
||||
|
||||
// Reverse ne and stride.
|
||||
std::reverse(acl_ne, acl_ne + final_dims);
|
||||
@@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
|
||||
aclTensor* acl_tensor = aclCreateTensor(
|
||||
acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
|
||||
offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
|
||||
elem_offset, format, &acl_storage_len, 1,
|
||||
tensor->data);
|
||||
|
||||
return acl_tensor;
|
||||
|
||||
@@ -99,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_unary_op(
|
||||
void ggml_cann_op_unary(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
@@ -111,6 +111,42 @@ void ggml_cann_unary_op(
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
}
|
||||
|
||||
void ggml_cann_op_unary_gated(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
|
||||
if(src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
|
||||
acl_src0 = ggml_cann_create_tensor(src0);
|
||||
acl_src1 = ggml_cann_create_tensor(src1);
|
||||
} else {
|
||||
int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
|
||||
size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
|
||||
acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
|
||||
acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
|
||||
if (swapped) {
|
||||
std::swap(acl_src0, acl_src1);
|
||||
}
|
||||
}
|
||||
|
||||
unary_op(ctx, acl_src0, acl_dst);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_dst);
|
||||
if(src1)
|
||||
ggml_cann_release_resources(ctx, acl_src1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Repeats elements of a tensor along each dimension according to the
|
||||
* specified repeat array.
|
||||
|
||||
@@ -1098,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
* @param dst The destination tensor. Its src[0] is treated as the input tensor.
|
||||
*/
|
||||
template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
@@ -1109,49 +1109,125 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
|
||||
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs a unary operation on the input tensor using
|
||||
* a user-provided lambda or callable object `unary_op`, which accepts the CANN
|
||||
* context and two ACL tensors (source and destination). Internally, this function
|
||||
* creates ACL representations of the ggml tensors and invokes the unary operation.
|
||||
* The result is stored in the destination tensor `dst`. This utility abstracts the
|
||||
* common boilerplate of tensor conversion and cleanup when implementing unary ops.
|
||||
* @details This function applies a unary operation to the input tensor using
|
||||
* a user-provided lambda or callable `unary_op`. The lambda receives the
|
||||
* CANN backend context and two ACL tensors: the source and the destination.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN APIs.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* The source tensor is retrieved from `dst->src[0]`.
|
||||
* Internally, this function handles the conversion from GGML tensors to ACL tensors,
|
||||
* calls the provided unary op, and manages resource cleanup. The input is assumed
|
||||
* to be `dst->src[0]`, and the result is written to `dst`.
|
||||
*
|
||||
* This utility simplifies writing unary op wrappers by abstracting tensor preparation.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN ACL APIs.
|
||||
* @param ctx The CANN context for operation execution.
|
||||
* @param dst The destination ggml_tensor where the result will be stored.
|
||||
* The input tensor is assumed to be `dst->src[0]`.
|
||||
*
|
||||
* @see GGML_CANN_CALL_OP_UNARY
|
||||
*/
|
||||
void ggml_cann_unary_op(
|
||||
void ggml_cann_op_unary(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
|
||||
* @brief Applies a gated (GLU-style) unary operation using the CANN backend.
|
||||
*
|
||||
* This macro defines an inline lambda wrapping a specific ACL operation name,
|
||||
* and passes it to the templated ggml_cann_unary_op function. It simplifies
|
||||
* calling unary ops by hiding the lambda boilerplate.
|
||||
* @details This function performs a gated activation such as GEGLU or ReGLU.
|
||||
* It supports two input modes:
|
||||
*
|
||||
* 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
|
||||
* These are used directly as the value and gate tensors.
|
||||
*
|
||||
* 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
|
||||
* contain a concatenation of value and gate along the first dimension. This tensor
|
||||
* will be split into two equal halves to form the value and gate inputs.
|
||||
*
|
||||
* The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
|
||||
* then multiplies the result in-place with the gate tensor:
|
||||
*
|
||||
* Internally, the lambda will call:
|
||||
* @code
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* dst = unary_op(value) * gate;
|
||||
* @endcode
|
||||
*
|
||||
* The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
|
||||
* order of value/gate in the packed input case.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN ACL APIs.
|
||||
* It receives (ctx, acl_value_tensor, acl_output_tensor).
|
||||
* @param ctx The CANN context used for execution.
|
||||
* @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
|
||||
*
|
||||
* @see GGML_CANN_CALL_OP_UNARY_GATED
|
||||
*/
|
||||
void ggml_cann_op_unary_gated(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
|
||||
*
|
||||
* This macro wraps the specified ACLNN unary operator name into a lambda expression,
|
||||
* and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
|
||||
* unary ops in the CANN backend.
|
||||
*
|
||||
* Internally, this macro expands to a lambda like:
|
||||
* @code
|
||||
* [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* };
|
||||
* @endcode
|
||||
*
|
||||
* This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
|
||||
*
|
||||
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
|
||||
*
|
||||
* @see ggml_cann_unary_op
|
||||
* @see ggml_cann_op_unary
|
||||
* @see GGML_CANN_CALL_ACLNN_OP
|
||||
*/
|
||||
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
|
||||
#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_unary_op(lambda, ctx, dst); \
|
||||
ggml_cann_op_unary(lambda, ctx, dst); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
/**
|
||||
* @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
|
||||
*
|
||||
* This macro wraps the specified ACLNN unary operator name into a lambda expression,
|
||||
* and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
|
||||
* executing gated unary ops in the CANN backend.
|
||||
*
|
||||
* Internally, this macro expands to a lambda like:
|
||||
* @code
|
||||
* [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* };
|
||||
* @endcode
|
||||
*
|
||||
* This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
|
||||
*
|
||||
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
|
||||
*
|
||||
* @see ggml_cann_op_unary_gated
|
||||
* @see GGML_CANN_CALL_ACLNN_OP
|
||||
*/
|
||||
#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_op_unary_gated(lambda, ctx, dst); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
#endif // CANN_ACLNN_OPS
|
||||
|
||||
@@ -1681,16 +1681,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(dst)) {
|
||||
case GGML_UNARY_OP_ABS:
|
||||
GGML_CANN_CALL_UNARY_OP(Abs);
|
||||
GGML_CANN_CALL_OP_UNARY(Abs);
|
||||
break;
|
||||
case GGML_UNARY_OP_NEG:
|
||||
GGML_CANN_CALL_UNARY_OP(Neg);
|
||||
GGML_CANN_CALL_OP_UNARY(Neg);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU:
|
||||
GGML_CANN_CALL_UNARY_OP(Gelu);
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
// aclnnGelu internally uses the erf-based approximation.
|
||||
GGML_CANN_CALL_OP_UNARY(Gelu);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
GGML_CANN_CALL_UNARY_OP(Silu);
|
||||
GGML_CANN_CALL_OP_UNARY(Silu);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK: {
|
||||
auto lambda = [](ggml_backend_cann_context& ctx,
|
||||
@@ -1698,31 +1700,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_unary_op(lambda, ctx, dst);
|
||||
ggml_cann_op_unary(lambda, ctx, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
GGML_CANN_CALL_UNARY_OP(Tanh);
|
||||
GGML_CANN_CALL_OP_UNARY(Tanh);
|
||||
break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
GGML_CANN_CALL_UNARY_OP(Relu);
|
||||
GGML_CANN_CALL_OP_UNARY(Relu);
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
GGML_CANN_CALL_UNARY_OP(Sigmoid);
|
||||
GGML_CANN_CALL_OP_UNARY(Sigmoid);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
|
||||
GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
GGML_CANN_CALL_UNARY_OP(Hardswish);
|
||||
GGML_CANN_CALL_OP_UNARY(Hardswish);
|
||||
break;
|
||||
case GGML_UNARY_OP_EXP:
|
||||
GGML_CANN_CALL_UNARY_OP(Exp);
|
||||
GGML_CANN_CALL_OP_UNARY(Exp);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
ggml_cann_elu(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
GGML_CANN_CALL_UNARY_OP(Sign);
|
||||
GGML_CANN_CALL_OP_UNARY(Sign);
|
||||
break;
|
||||
case GGML_UNARY_OP_STEP:
|
||||
ggml_cann_step(ctx, dst);
|
||||
@@ -1731,6 +1733,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(dst)) {
|
||||
case GGML_GLU_OP_REGLU:
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Relu);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
// aclnnGelu internally uses the erf-based approximation.
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
|
||||
break;
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK: {
|
||||
auto lambda = [](ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_src,
|
||||
aclTensor* acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_op_unary_gated(lambda, ctx, dst);
|
||||
} break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_NORM:
|
||||
ggml_cann_norm(ctx, dst);
|
||||
break;
|
||||
@@ -1773,7 +1800,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
ggml_cann_binary_op<aclnn_mul>(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SQRT:
|
||||
GGML_CANN_CALL_UNARY_OP(Sqrt);
|
||||
GGML_CANN_CALL_OP_UNARY(Sqrt);
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
ggml_cann_clamp(ctx, dst);
|
||||
@@ -1818,16 +1845,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
ggml_cann_argmax(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
ggml_cann_unary_op<aclnn_cos>(ctx, dst);
|
||||
ggml_cann_op_unary<aclnn_cos>(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_cann_unary_op<aclnn_sin>(ctx, dst);
|
||||
ggml_cann_op_unary<aclnn_sin>(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_cann_conv_transpose_1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_LOG:
|
||||
GGML_CANN_CALL_UNARY_OP(Log);
|
||||
GGML_CANN_CALL_OP_UNARY(Log);
|
||||
break;
|
||||
case GGML_OP_MEAN:
|
||||
ggml_cann_mean(ctx, dst);
|
||||
@@ -2101,10 +2128,23 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(op)) {
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
|
||||
@@ -70,10 +70,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
endif()
|
||||
@@ -456,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
list(APPEND ARCH_FLAGS -march=z16)
|
||||
elseif (${S390X_M} MATCHES "9175|9176")
|
||||
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
|
||||
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
|
||||
message(STATUS "z17 target")
|
||||
list(APPEND ARCH_FLAGS -march=z17)
|
||||
else()
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cstdlib> // for qsort
|
||||
#include <cstdio> // for GGML_ASSERT
|
||||
|
||||
#include "repack.h"
|
||||
|
||||
@@ -56,7 +56,7 @@
|
||||
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
||||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
||||
|
||||
@@ -72,8 +72,9 @@
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
|
||||
// Moore Threads
|
||||
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
@@ -226,6 +227,10 @@ typedef float2 dfloat2;
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
||||
#define AMD_MFMA_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_CDNA(cc);
|
||||
}
|
||||
|
||||
// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
|
||||
static bool amd_mfma_available(const int cc) {
|
||||
return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
static bool new_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
||||
@@ -765,7 +775,7 @@ struct ggml_tensor_extra_gpu {
|
||||
};
|
||||
|
||||
|
||||
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
|
||||
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
|
||||
#define USE_CUDA_GRAPH
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#include "cpy.cuh"
|
||||
#include "dequantize.cuh"
|
||||
#include "cpy-utils.cuh"
|
||||
#ifdef GGML_USE_MUSA
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
#include "ggml-musa/mudnn.cuh"
|
||||
#endif // GGML_USE_MUSA
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
|
||||
// Copy destination pointers to GPU to be available when pointer indirection is in use
|
||||
|
||||
void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
if (cuda_graph->dest_ptrs_d != nullptr) {
|
||||
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
|
||||
char ** dest_ptrs_d = nullptr;
|
||||
int graph_cpynode_index = -1;
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
||||
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
||||
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
#endif
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
#ifdef GGML_USE_MUSA
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
|
||||
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
|
||||
} else
|
||||
#endif // GGML_USE_MUSA
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
{
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
||||
}
|
||||
|
||||
@@ -1330,14 +1330,16 @@ static __global__ void flash_attn_ext_f16(
|
||||
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -37,16 +37,16 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -282,16 +282,16 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -174,7 +174,10 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
@@ -291,10 +294,6 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -329,16 +328,16 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -180,7 +180,10 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
@@ -286,10 +289,6 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
+111
-3
@@ -12,7 +12,8 @@
|
||||
// The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
|
||||
// All matrix tiles have ne physical 32 bit elements per warp.
|
||||
//
|
||||
// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
|
||||
// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
|
||||
// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes, unaligned pointers are considered undefined behavior.
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
|
||||
struct tile {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
static constexpr int ne = I * J / 64;
|
||||
T x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
return threadIdx.x % 32;
|
||||
} else if constexpr (I == 16 && J == 16) {
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 32) {
|
||||
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
|
||||
} else {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return (2 * ((threadIdx.x / 16) % 2) + l);
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return 2 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
return 2 * (threadIdx.x / 32) + l;
|
||||
} else if constexpr (I == 16 && J == 16) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 32 && J == 32) {
|
||||
return threadIdx.x % 32;
|
||||
} else {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {
|
||||
|
||||
template <int I, int J, typename T>
|
||||
static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
} else {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#if defined(NEW_MMA_AVAILABLE)
|
||||
int * xi = (int * ) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
@@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
|
||||
int32x16_t * acc = (int32x16_t *) D.x;
|
||||
#if defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s03 = src0->nb[3] / ts_src0;
|
||||
const int64_t s3 = dst->nb[3] / ts_dst;
|
||||
|
||||
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
|
||||
const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
|| (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
|
||||
|
||||
if (!ids) {
|
||||
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
|
||||
const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
|| (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
|
||||
&& src1_ncols == ne11;
|
||||
const mmq_args args = {
|
||||
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
|
||||
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
|
||||
@@ -304,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_mma_available(cc)) {
|
||||
if (new_mma_available(cc) || amd_mfma_available(cc)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
+1160
-697
File diff suppressed because it is too large
Load Diff
@@ -44,6 +44,9 @@ static __global__ void k_set_rows_quant(
|
||||
block_type * dst_block = dst_row_ptr + i00 / qk;
|
||||
|
||||
quantize_func(src_block, dst_block);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
// Template dispatch function for quantized set_rows
|
||||
|
||||
Vendored
+13
-1
@@ -160,7 +160,19 @@
|
||||
#endif
|
||||
|
||||
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
|
||||
#define CDNA
|
||||
#define CDNA // For the entire family
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
#define CDNA3
|
||||
#endif
|
||||
|
||||
#if defined(__gfx90a__)
|
||||
#define CDNA2
|
||||
#endif
|
||||
|
||||
#if defined(__gfx908__)
|
||||
#define CDNA1
|
||||
#endif
|
||||
|
||||
#if defined(__GFX12__)
|
||||
|
||||
Vendored
+2
-2
@@ -13,7 +13,7 @@
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_16BF MUSA_R_16BF
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
@@ -29,7 +29,7 @@
|
||||
#define cublasSgemm mublasSgemm
|
||||
#define cublasStatus_t mublasStatus_t
|
||||
#define cublasOperation_t mublasOperation_t
|
||||
#define cublasGetStatusString mublasStatus_to_string
|
||||
#define cublasGetStatusString mublasGetStatusString
|
||||
#define cudaDataType_t musaDataType_t
|
||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||
|
||||
@@ -528,6 +528,7 @@ typedef struct {
|
||||
int64_t n_group;
|
||||
int64_t n_seq_tokens;
|
||||
int64_t n_seqs;
|
||||
int64_t s_off;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
|
||||
@@ -3141,6 +3141,7 @@ static int ggml_metal_encode_node(
|
||||
/*.n_group =*/ n_group,
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.s_off =*/ ggml_nelements(src1) * sizeof(float),
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
@@ -3169,12 +3170,22 @@ static int ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:7];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:8];
|
||||
|
||||
// One shared memory bucket for each simd group in the threadgroup
|
||||
// NOTE: Metal kernels require the buffer size to be multiple of 16 bytes
|
||||
// https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
|
||||
if (d_state >= 32) {
|
||||
GGML_ASSERT((int64_t)(d_state / 32) <= 32);
|
||||
const int64_t shmem_size = 32;
|
||||
GGML_ASSERT(d_state <= (int64_t)pipeline.maxTotalThreadsPerThreadgroup);
|
||||
[encoder setThreadgroupMemoryLength:(shmem_size)*sizeof(float) atIndex:0];
|
||||
}
|
||||
|
||||
if (ne30 == 1) {
|
||||
// Mamba-2
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
|
||||
} else {
|
||||
GGML_ASSERT(d_inner == 1);
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
|
||||
@@ -1823,10 +1823,16 @@ kernel void kernel_ssm_scan_f32(
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
threadgroup float * shared [[threadgroup(0)]],
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgptg[[simdgroups_per_threadgroup]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
|
||||
const int64_t i0 = tpitg.x;
|
||||
const int64_t i1 = 0;
|
||||
const int64_t ir = tgpig.x; // current head
|
||||
const int64_t i3 = tgpig.y; // current seq
|
||||
@@ -1841,41 +1847,88 @@ kernel void kernel_ssm_scan_f32(
|
||||
const int64_t ng = args.n_group;
|
||||
const int64_t n_t = args.n_seq_tokens;
|
||||
|
||||
const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
|
||||
const int64_t s_off = args.s_off;
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
const int64_t i = i0 + i1*nc;
|
||||
float s0 = s0_buff[i];
|
||||
float s = s_buff[i];
|
||||
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31);
|
||||
device const float * x_block = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13);
|
||||
device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22);
|
||||
device const float * B_block = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43);
|
||||
device const float * C_block = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53);
|
||||
device float * y_block = (device float *) ((device char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00);
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {d_state, nh}
|
||||
device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
|
||||
device const float * x = (device const float *) ((device const char *) x_block + i2*args.nb12); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) dt_block + i2*args.nb21); // {nh, nt, ns}
|
||||
device const float * B = (device const float *) ((device const char *) B_block + i2*args.nb42); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) C_block + i2*args.nb52); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) y_block + i2*(nh*nr*nb00)); // {dim, nh, nt, ns}
|
||||
|
||||
const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
const float x_dt = x[0] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
const int64_t i = i0 + i1*nc;
|
||||
const float state = (s0[i] * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
const float state = (s0 * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt);
|
||||
s = state;
|
||||
|
||||
// Parallel sum: This relies on the fact that this kernel will be
|
||||
// dispatched with each threadgroup having (d_state, 1, 1) threads which
|
||||
// are subdivided into SIMD groups of size `sgptg`. The goal is to
|
||||
// compute y = sum({state * C[i] for i in range(d_state)}).
|
||||
// To parallelize this effectively, we first use simd_sum over each SIMD
|
||||
// group to compute the sum of each SIMD group, then place the result in
|
||||
// the SIMD group's indexed bucket in the shared memory. We then sum
|
||||
// over the individual group sums to compute the final sum.
|
||||
|
||||
// Computed for each thread
|
||||
float sumf = state * C[i0];
|
||||
|
||||
// Sum the threads in the simd group => simd sum
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
if (sgptg > 1) {
|
||||
|
||||
// Once per simd group, place the group sum into the shared buffer
|
||||
if (tiisg == 0) {
|
||||
shared[sgitg] = sumf;
|
||||
}
|
||||
|
||||
// Wait for all threads in the threadgroup to reach this point. This
|
||||
// ensures that all elements of the shared buffer are populated with the
|
||||
// sum of the individual simd groups.
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// For simd group 0 at indices < num simd groups, extract the shared
|
||||
// simd sum
|
||||
sumf = 0.0f;
|
||||
if (sgitg == 0) {
|
||||
if (tiisg < sgptg) {
|
||||
sumf = shared[tiisg];
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
y[0] = sumf;
|
||||
}
|
||||
}
|
||||
} else if (tiisg == 0) {
|
||||
y[0] = sumf;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// Assign the final state to the output buffer
|
||||
s_buff[i] = s;
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
|
||||
// TODO: optimize (e.g. by parallelizing over d_state)
|
||||
kernel void kernel_ssm_scan_f32_group(
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
@@ -1885,10 +1938,16 @@ kernel void kernel_ssm_scan_f32_group(
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
threadgroup float * shared [[threadgroup(0)]],
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgptg[[simdgroups_per_threadgroup]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
|
||||
const int64_t i0 = tpitg.x;
|
||||
const int64_t i1 = tgpig.x;
|
||||
const int64_t ir = tgpig.y; // current head
|
||||
const int64_t i3 = tgpig.z; // current seq
|
||||
@@ -1903,38 +1962,81 @@ kernel void kernel_ssm_scan_f32_group(
|
||||
const int64_t ng = args.n_group;
|
||||
const int64_t n_t = args.n_seq_tokens;
|
||||
|
||||
const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
|
||||
const int64_t s_off = args.s_off;
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
const int64_t i = i0 + i1*nc;
|
||||
float s0 = s0_buff[i];
|
||||
float s = s_buff[i];
|
||||
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
|
||||
device const float * x_block = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13);
|
||||
device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22);
|
||||
device const float * B_block = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43);
|
||||
device const float * C_block = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53);
|
||||
device float * y_block = (device float *) ((device char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00);
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
|
||||
device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
|
||||
device const float * x = (device const float *) ((device const char *) x_block + i2*args.nb12); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) dt_block + i2*args.nb21); // {nh, nt, ns}
|
||||
device const float * B = (device const float *) ((device const char *) B_block + i2*args.nb42); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) C_block + i2*args.nb52); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) y_block + i2*(nh*nr*nb00)); // {dim, nh, nt, ns}
|
||||
|
||||
const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
const float x_dt = x[0] * dt_soft_plus;
|
||||
const float dA = exp(dt_soft_plus * A[0]);
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
const int64_t i = i0 + i1*nc;
|
||||
const float state = (s0[i] * dA) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
const float state = (s0 * dA) + (B[i0] * x_dt);
|
||||
s = state;
|
||||
|
||||
// Parallel sum: This relies on the fact that this kernel will be
|
||||
// dispatched with each threadgroup having (d_state, 1, 1) threads which
|
||||
// are subdivided into SIMD groups of size `sgptg`. The goal is to
|
||||
// compute y = sum({state * C[i] for i in range(d_state)}).
|
||||
// To parallelize this effectively, we first use simd_sum over each SIMD
|
||||
// group to compute the sum of each SIMD group, then place the result in
|
||||
// the SIMD group's indexed bucket in the shared memory. We then sum
|
||||
// over the individual group sums to compute the final sum.
|
||||
|
||||
// Computed for each thread
|
||||
float sumf = state * C[i0];
|
||||
|
||||
// Sum the threads in the simd group => simd sum
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
// Once per simd group, place the group sum into the shared buffer
|
||||
if (tiisg == 0) {
|
||||
shared[sgitg] = sumf;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
// Wait for all threads in the threadgroup to reach this point. This
|
||||
// ensures that all elements of the shared buffer are populated with the
|
||||
// sum of the individual simd groups.
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// For simd group 0 at indices < num simd groups, extract the shared
|
||||
// simd sum
|
||||
sumf = 0.0f;
|
||||
if (sgitg == 0) {
|
||||
if (tiisg < sgptg) {
|
||||
sumf = shared[tiisg];
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
y[0] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// Assign the final state to the output buffer
|
||||
s_buff[i] = s;
|
||||
}
|
||||
|
||||
kernel void kernel_rwkv_wkv6_f32(
|
||||
|
||||
@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
file(GLOB SRCS "../ggml-musa/*.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
|
||||
if (GGML_MUSA_MUDNN_COPY)
|
||||
file(GLOB SRCS "../ggml-musa/*.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
add_compile_definitions(GGML_MUSA_MUDNN_COPY)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
|
||||
@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
|
||||
add_compile_definitions(GGML_USE_MUSA)
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_MUSA_GRAPHS)
|
||||
add_compile_definitions(GGML_MUSA_GRAPHS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
|
||||
endif()
|
||||
|
||||
if (GGML_STATIC)
|
||||
# TODO: mudnn has not provided static libraries yet
|
||||
target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
|
||||
# TODO: mudnn has not provided static libraries yet
|
||||
# if (GGML_MUSA_MUDNN_COPY)
|
||||
# target_link_libraries(ggml-musa PRIVATE mudnn_static)
|
||||
# endif()
|
||||
else()
|
||||
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
|
||||
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
|
||||
if (GGML_MUSA_MUDNN_COPY)
|
||||
target_link_libraries(ggml-musa PRIVATE mudnn)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
|
||||
@@ -333,6 +333,7 @@ struct ggml_backend_opencl_context {
|
||||
size_t max_alloc_size;
|
||||
bool fp16_support;
|
||||
bool has_vector_subgroup_broadcast;
|
||||
bool disable_fusion;
|
||||
ggml_cl_compiler_version adreno_cl_compiler_version;
|
||||
|
||||
int adreno_wave_size;
|
||||
@@ -411,7 +412,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
||||
cl_kernel kernel_norm;
|
||||
cl_kernel kernel_rms_norm;
|
||||
cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
|
||||
cl_kernel kernel_group_norm;
|
||||
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
|
||||
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
||||
@@ -1100,7 +1101,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
backend_ctx->program_rms_norm =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rms_norm_mul = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm_mul", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -2110,6 +2112,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
|
||||
|
||||
dev_ctx->backend_ctx = backend_ctx.release();
|
||||
return dev_ctx->backend_ctx;
|
||||
}
|
||||
@@ -2279,7 +2283,45 @@ static void sync_with_other_backends(ggml_backend_t backend) {
|
||||
sync_with_other_backends(backend_ctx);
|
||||
}
|
||||
|
||||
static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
|
||||
const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
|
||||
const ggml_tensor *mul = cgraph->nodes[node_idx+1];
|
||||
|
||||
GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
|
||||
|
||||
// rms_norm only supports f32
|
||||
if (mul->src[0]->type != GGML_TYPE_F32 ||
|
||||
mul->src[1]->type != GGML_TYPE_F32 ||
|
||||
mul->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if rms_norm is the B operand, then we don't handle broadcast
|
||||
if (rms_norm == mul->src[1] &&
|
||||
!ggml_are_same_shape(mul->src[0], rms_norm->src[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// rms_norm assumes contiguous rows
|
||||
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor);
|
||||
|
||||
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2292,6 +2334,12 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
bool ok = ggml_cl_compute_forward(backend, node);
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
@@ -4455,6 +4503,117 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) {
|
||||
GGML_ASSERT(mul_tensor);
|
||||
GGML_ASSERT(rms_norm_tensor);
|
||||
|
||||
// src0 is the src of rms_norm, src1 is the other src of mul (one being rms_norm)
|
||||
const ggml_tensor * src0 = rms_norm_tensor->src[0];
|
||||
const ggml_tensor * src1;
|
||||
if (mul_tensor->src[0] == rms_norm_tensor) {
|
||||
src1 = mul_tensor->src[1];
|
||||
} else if (mul_tensor->src[1] == rms_norm_tensor) {
|
||||
src1 = mul_tensor->src[0];
|
||||
} else {
|
||||
GGML_ASSERT(false && "Invalid args for rms_norm and mul");
|
||||
}
|
||||
const ggml_tensor * dst = mul_tensor;
|
||||
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3];
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
size_t sgs;
|
||||
if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
} else if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 32;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported GPU");
|
||||
}
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_rms_norm_mul;
|
||||
|
||||
int nth = sgs;
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
while (nth < ne00 && nth < max_workgroup_size) {
|
||||
nth *= 2;
|
||||
}
|
||||
nth = MIN(nth, max_workgroup_size);
|
||||
nth = MIN(nth, ne00);
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*nth/sgs, NULL));
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
|
||||
@@ -94,3 +94,82 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// rms_norm_mul
|
||||
//------------------------------------------------------------------------------
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_32
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_rms_norm_mul(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float eps,
|
||||
local float * sum
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float4 * x = (global float4 *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
global float4 * f = (global float4 *) (src1 + (i03%ne13)*nb13 + (i02%ne12)*nb12 + (i01%ne11)*nb11);
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
sum[get_sub_group_id()] = sumf;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
|
||||
if (get_local_id(0) < i) {
|
||||
sum[get_local_id(0)] += sum[get_local_id(0) + i];
|
||||
}
|
||||
}
|
||||
if (get_local_id(0) == 0) {
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float mean = sum[0];
|
||||
float scale = 1.0f/sqrt(mean + eps);
|
||||
|
||||
global float4 * y = (global float4 *) (dst + i03*nb3 + i02*nb2 + i01*nb1);
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
y[i00] = (x[i00] * scale) * f[i00%(ne10/4)];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1055,7 +1055,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||
if (tensor == nullptr) {
|
||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1124,7 +1124,7 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
if (tensor == nullptr) {
|
||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1192,7 +1192,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
if (tensor == nullptr) {
|
||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1229,7 +1229,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
|
||||
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
|
||||
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
|
||||
if (src == nullptr || dst == nullptr) {
|
||||
if (src == nullptr || dst == nullptr || src->buffer == nullptr || dst->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "mmvq.hpp"
|
||||
#include "norm.hpp"
|
||||
#include "outprod.hpp"
|
||||
#include "quantize.hpp"
|
||||
#include "quants.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "set_rows.hpp"
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#include "ggml-sycl/set_rows.hpp"
|
||||
#include "ggml-sycl/sycl_hw.hpp"
|
||||
#include "ggml-sycl/getrows.hpp"
|
||||
#include "ggml-sycl/quantize.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
static bool g_sycl_loaded = false;
|
||||
@@ -1373,120 +1374,6 @@ typedef void (*ggml_sycl_op_mul_mat_t)(
|
||||
|
||||
|
||||
|
||||
template<int QUANT_BLOCK_TILE>
|
||||
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
|
||||
|
||||
if (ix >= kx_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
|
||||
const int i_padded = iy*kx_padded + ix;
|
||||
|
||||
block_q8_1 * y = (block_q8_1 *) vy;
|
||||
|
||||
const int ib = i_padded / QK8_1; // block index
|
||||
const int iqs = i_padded % QK8_1; // quant index
|
||||
typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
|
||||
typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
|
||||
TC zeros;
|
||||
TQ qzeros;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QUANT_BLOCK_TILE; i++)
|
||||
{
|
||||
zeros[i] = 0.f;
|
||||
qzeros[i] = 0;
|
||||
}
|
||||
const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
|
||||
float sum = xi[0];
|
||||
float amax = sycl::fabs(xi[0]);
|
||||
#pragma unroll
|
||||
for (int i = 1; i < QUANT_BLOCK_TILE; i++)
|
||||
{
|
||||
sum += xi[i];
|
||||
amax = sycl::fmax(sycl::fabs(xi[i]), amax);
|
||||
}
|
||||
sum = warp_reduce_sum(sum, item_ct1);
|
||||
amax = warp_reduce_max(amax, item_ct1);
|
||||
|
||||
const float d = amax / 127;
|
||||
TQ q = qzeros;
|
||||
if (amax != 0.0f)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
|
||||
q[i] = sycl::round(xi[i] / d);
|
||||
}
|
||||
}
|
||||
|
||||
*(TQ *)&y[ib].qs[iqs] = q;
|
||||
|
||||
if (iqs > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
|
||||
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
|
||||
}
|
||||
|
||||
template <int ElementsPerWI>
|
||||
static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
|
||||
const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
|
||||
/*
|
||||
Quantizes and reorders the resultant q8 tensor in a per row fashion
|
||||
Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
|
||||
*/
|
||||
|
||||
auto subgroup_id = it.get_group(0);
|
||||
auto wi_id = it.get_local_id(0);
|
||||
|
||||
const int num_blocks_per_row = kx / QK8_1;
|
||||
auto row = subgroup_id / num_blocks_per_row;
|
||||
auto col = subgroup_id % num_blocks_per_row;
|
||||
|
||||
auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
|
||||
auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
|
||||
|
||||
auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
|
||||
auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
|
||||
|
||||
sycl::vec<float, ElementsPerWI> wi_f32_vals;
|
||||
sycl::vec<int8_t, ElementsPerWI> quantized_values;
|
||||
|
||||
auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
|
||||
wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
|
||||
|
||||
float sum = 0.0f;
|
||||
float amax = 0.0f;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
sum += wi_f32_vals[i];
|
||||
amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
|
||||
quantized_values[i] = 0;
|
||||
}
|
||||
sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
|
||||
amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
|
||||
float d = amax == 0 ? 1 : amax / 127;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
|
||||
}
|
||||
|
||||
d = amax == 0 ? 0 : d;
|
||||
|
||||
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
|
||||
if (wi_id == 0) {
|
||||
*ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_p021_f16_f32(
|
||||
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
|
||||
@@ -1770,32 +1657,6 @@ static void pool2d_nchw_kernel(
|
||||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
|
||||
|
||||
static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
|
||||
bool reorder_q8_tensor, queue_ptr stream) {
|
||||
if (reorder_q8_tensor) {
|
||||
auto local_range = std::size_t(WARP_SIZE);
|
||||
auto num_quant_blocks = ky * (kx / QK8_1);
|
||||
auto global_range = num_quant_blocks * local_range;
|
||||
stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
|
||||
[=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
|
||||
});
|
||||
} else {
|
||||
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
||||
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
||||
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
|
||||
static_assert(QK8_1 % WARP_SIZE == 0);
|
||||
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
||||
float *dst, const int ncols_x,
|
||||
@@ -2372,10 +2233,10 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
|
||||
peer_access_enabled = enable_peer_access;
|
||||
}
|
||||
|
||||
template <template <int> typename quantize_f>
|
||||
static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
ggml_sycl_op_mul_mat_t op,
|
||||
const bool convert_src1_to_q8_1) try {
|
||||
ggml_sycl_op_mul_mat_t op) try {
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
||||
|
||||
@@ -2470,6 +2331,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
}
|
||||
}
|
||||
|
||||
constexpr bool quantize_enabled = !std::is_same_v<quantize_f<QK8_1 / WARP_SIZE>,
|
||||
no_quantize_q8_1<QK8_1 / WARP_SIZE>>;
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
|
||||
continue;
|
||||
@@ -2495,20 +2358,19 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1) {
|
||||
if constexpr(quantize_enabled) {
|
||||
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
|
||||
/*
|
||||
DPCT1010:90: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The call was replaced with 0. You need to
|
||||
rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(0);
|
||||
try {
|
||||
quantize_row_q8_1_sycl<quantize_f>(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << "Quantize_row_q8_1_sycl error" << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2524,11 +2386,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||
if (split && used_devices > 1) {
|
||||
ggml_sycl_set_device(ctx.device);
|
||||
/*
|
||||
DPCT1024:91: The original code returned the error code that was further
|
||||
consumed by the program logic. This original code was replaced with 0.
|
||||
You may need to rewrite the program logic consuming the error code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
*src0_extra->events[ctx.device][0] =
|
||||
ctx.stream()->ext_oneapi_submit_barrier()));
|
||||
@@ -2552,11 +2409,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
|
||||
// wait for main GPU data if necessary
|
||||
if (split && (i != ctx.device || is != 0)) {
|
||||
/*
|
||||
DPCT1009:163: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The original code was commented out and a
|
||||
warning string was inserted. You need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
|
||||
{*src0_extra->events[ctx.device][0]})));
|
||||
}
|
||||
@@ -2582,39 +2434,42 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
// copy src0, src1 to device if necessary
|
||||
if (src1_is_contiguous) {
|
||||
if (i != ctx.device) {
|
||||
if (convert_src1_to_q8_1) {
|
||||
if constexpr (quantize_enabled) {
|
||||
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
|
||||
src1_ddq_i, src1_ddq_i_source,
|
||||
src1_ncols * src1_padded_col_size * q8_1_ts /
|
||||
q8_1_bs).wait()));
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(stream
|
||||
->memcpy(src1_ddq_i, src1_ddq_i_source,
|
||||
src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs)
|
||||
.wait()));
|
||||
} else {
|
||||
|
||||
float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device];
|
||||
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
||||
src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream,
|
||||
src1_ddf_i, src1_ddf_i_source,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
}
|
||||
}
|
||||
} else if (src1_on_device && !src1_is_contiguous) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
if (src1_on_device) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, src1_col_0,
|
||||
src1_col_0 + src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ABORT("src1 is non-contiguous and not on device");
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
|
||||
/*
|
||||
DPCT1010:92: SYCL uses exceptions to report errors and does
|
||||
not use the error codes. The call was replaced with 0. You
|
||||
need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(0);
|
||||
if constexpr (quantize_enabled) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
try {
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(src1_ddf_i, src1_ddq_i, ne10, src1_ncols,
|
||||
src1_padded_col_size, stream);
|
||||
} catch (const sycl::exception & exc) {
|
||||
std::cerr << "Quantize_row_q8_1_sycl error" << exc.what()
|
||||
<< "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
|
||||
@@ -2626,12 +2481,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
// do the computation
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
||||
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
|
||||
/*
|
||||
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The call was replaced with 0. You need to
|
||||
rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(0);
|
||||
|
||||
// copy dst to host or other device if necessary
|
||||
if (!dst_on_device) {
|
||||
@@ -2662,12 +2511,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
|
||||
// add event for the main device to wait on until other device is done
|
||||
if (split && (i != ctx.device || is != 0)) {
|
||||
/*
|
||||
DPCT1024:94: The original code returned the error code that
|
||||
was further consumed by the program logic. This original
|
||||
code was replaced with 0. You may need to rewrite the
|
||||
program logic consuming the error code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
*src0_extra->events[i][is] =
|
||||
stream->ext_oneapi_submit_barrier()));
|
||||
@@ -3351,19 +3194,20 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
// KQ + KQV multi-batch
|
||||
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
||||
} else if (use_dequantize_mul_mat_vec) {
|
||||
constexpr bool convert_src1_to_q8_1 = false;
|
||||
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
|
||||
ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec);
|
||||
} else if (use_mul_mat_vec_q) {
|
||||
constexpr bool convert_src1_to_q8_1 = true;
|
||||
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
|
||||
ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
|
||||
if (extra && extra->optimized_feature.reorder) {
|
||||
ggml_sycl_op_mul_mat<quantize_and_reorder_q8_1_soa>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
|
||||
} else {
|
||||
ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
|
||||
}
|
||||
} else if (use_mul_mat_q) {
|
||||
constexpr bool convert_src1_to_q8_1 = true;
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
|
||||
ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q);
|
||||
} else {
|
||||
constexpr bool convert_src1_to_q8_1 = false;
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
|
||||
ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,133 @@
|
||||
/***************************************************************************
|
||||
*
|
||||
* Copyright (C) 2025 Codeplay Software Ltd.
|
||||
* Copyright (C) 2025 Intel Corporation
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* quantize.hpp
|
||||
*
|
||||
* Description:
|
||||
* Sycl backend specific quantization functions
|
||||
**************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <sycl/nd_item.hpp>
|
||||
|
||||
#include "ggml-sycl/dpct/helper.hpp"
|
||||
|
||||
template <int ElementsPerWI>
|
||||
__dpct_inline__ static void quantize_q8_1_impl(const float * __restrict__ x,
|
||||
sycl::vec<int8_t, ElementsPerWI> & quantized_values, float & d,
|
||||
float & sum, const sycl::nd_item<1> & it) {
|
||||
auto subgroup_id = it.get_group(0);
|
||||
auto wi_id = it.get_local_id(0);
|
||||
|
||||
sycl::vec<float, ElementsPerWI> wi_f32_vals;
|
||||
|
||||
auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
|
||||
wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
|
||||
|
||||
float amax = 0.0f;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
sum += wi_f32_vals[i];
|
||||
amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
|
||||
quantized_values[i] = 0;
|
||||
}
|
||||
sum = sycl::reduce_over_group(it.get_sub_group(), sum, sycl::plus<float>());
|
||||
amax = sycl::reduce_over_group(it.get_sub_group(), amax, sycl::maximum<float>());
|
||||
d = amax == 0 ? 1 : amax / 127;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
|
||||
}
|
||||
|
||||
d = amax == 0 ? 0 : d;
|
||||
}
|
||||
|
||||
// No op to control codepath in ggml_sycl_op_mul_mat
|
||||
template <int ElementsPerWI> struct no_quantize_q8_1 {
|
||||
void operator()(const float *, void *, int, int, const sycl::nd_item<1> &) const {}
|
||||
};
|
||||
|
||||
template <int ElementsPerWI> struct quantize_and_reorder_q8_1_soa {
|
||||
__dpct_inline__ void operator()(const float * __restrict__ x, void * reordered_q8_tensor, const int kx,
|
||||
const int kx_padded, const sycl::nd_item<1> & it) const {
|
||||
/*
|
||||
Quantizes and reorders the resultant q8 tensor in a per row fashion
|
||||
Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
|
||||
*/
|
||||
auto subgroup_id = it.get_group(0);
|
||||
auto wi_id = it.get_local_id(0);
|
||||
|
||||
sycl::vec<int8_t, ElementsPerWI> quantized_values;
|
||||
float d = 0.0f;
|
||||
float sum = 0.0f;
|
||||
quantize_q8_1_impl<ElementsPerWI>(x, quantized_values, d, sum, it);
|
||||
|
||||
const int num_blocks_per_row = kx / QK8_1;
|
||||
auto row = subgroup_id / num_blocks_per_row;
|
||||
auto col = subgroup_id % num_blocks_per_row;
|
||||
auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
|
||||
auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
|
||||
|
||||
auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
|
||||
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
|
||||
|
||||
auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
|
||||
if (wi_id == 0) {
|
||||
*ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <int ElementsPerWI> struct quantize_q8_1 {
|
||||
__dpct_inline__ void operator()(const float * __restrict__ x, void * q8_tensor, const int kx, const int kx_padded,
|
||||
const sycl::nd_item<1> & it) const {
|
||||
auto subgroup_id = it.get_group(0);
|
||||
auto wi_id = it.get_local_id(0);
|
||||
|
||||
const int num_blocks_per_row = kx / QK8_1;
|
||||
auto row = subgroup_id / num_blocks_per_row;
|
||||
const int pitch = kx_padded / QK8_1;
|
||||
|
||||
sycl::vec<int8_t, ElementsPerWI> quantized_values;
|
||||
float d = 0.0f;
|
||||
float sum = 0.0f;
|
||||
quantize_q8_1_impl<ElementsPerWI>(x, quantized_values, d, sum, it);
|
||||
|
||||
block_q8_1 * quant_ptr = (block_q8_1 *) q8_tensor;
|
||||
auto block_id = subgroup_id % num_blocks_per_row + row * pitch;
|
||||
|
||||
int8_t * qs = &(quant_ptr[block_id].qs[wi_id * ElementsPerWI]);
|
||||
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(qs) = quantized_values;
|
||||
if (wi_id == 0) {
|
||||
quant_ptr[block_id].ds = sycl::half2(sycl::half(d), sycl::half(sum));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <template <int> typename quantize_f>
|
||||
void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
|
||||
dpct::queue_ptr stream) {
|
||||
static_assert(QK8_1 % WARP_SIZE == 0);
|
||||
auto local_range = std::size_t(WARP_SIZE);
|
||||
auto num_quant_blocks = ky * (kx / QK8_1);
|
||||
auto global_range = num_quant_blocks * local_range;
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
|
||||
[=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
quantize_f<QK8_1 / WARP_SIZE>()(x, vy, kx, kx_padded, it);
|
||||
});
|
||||
}
|
||||
@@ -484,6 +484,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||
vk_pipeline pipeline_conv2d_f32;
|
||||
vk_pipeline pipeline_conv2d_f16_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
|
||||
|
||||
@@ -3074,12 +3075,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
|
||||
ggml_vk_create_pipeline(
|
||||
device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
|
||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(
|
||||
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
|
||||
false);
|
||||
ggml_vk_create_pipeline(
|
||||
device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
|
||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
|
||||
false);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -6958,9 +6968,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||
return ctx->device->pipeline_conv2d_f32;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_conv2d_f32;
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_conv2d_f16_f32;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
@@ -7882,6 +7896,13 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
// Skip empty skip_rows operations. For most ops the empty check at the start
|
||||
// of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst
|
||||
// with empty srcs.
|
||||
if (ggml_is_empty(src0) || ggml_is_empty(src1)) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
@@ -8178,13 +8199,13 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
|
||||
static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
|
||||
const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
||||
@@ -10867,7 +10888,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE;
|
||||
// Channel-contiguous format is not supported yet.
|
||||
return (op->src[0]->type == GGML_TYPE_F32 &&
|
||||
return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
op->src[1]->type == GGML_TYPE_F32 &&
|
||||
op->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(op->src[0]) &&
|
||||
|
||||
@@ -656,6 +656,7 @@ void process_shaders() {
|
||||
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
|
||||
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
|
||||
|
||||
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
||||
|
||||
+4
-6
@@ -6640,20 +6640,18 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
|
||||
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
||||
struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
|
||||
struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
||||
fprintf(fp, " \"%p\" -> \"%p\" [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
||||
gparent0 ? (void *) gparent0 : (void *) parent,
|
||||
gparent0 ? "g" : "x",
|
||||
gparent ? (void *) gparent : (void *) node,
|
||||
gparent ? "g" : "x",
|
||||
gparent ? "empty" : "vee",
|
||||
gparent ? "dashed" : "solid",
|
||||
label);
|
||||
}
|
||||
|
||||
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
|
||||
(void *) parent, "x",
|
||||
(void *) node, "x",
|
||||
fprintf(fp, " \"%p\" -> \"%p\" [ label = \"%s\"; ]\n",
|
||||
(void *) parent,
|
||||
(void *) node,
|
||||
label);
|
||||
}
|
||||
|
||||
|
||||
@@ -376,6 +376,7 @@ class MODEL_ARCH(IntEnum):
|
||||
SMOLLM3 = auto()
|
||||
LFM2 = auto()
|
||||
DREAM = auto()
|
||||
SMALLTHINKER = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -695,6 +696,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.SMOLLM3: "smollm3",
|
||||
MODEL_ARCH.LFM2: "lfm2",
|
||||
MODEL_ARCH.DREAM: "dream",
|
||||
MODEL_ARCH.SMALLTHINKER: "smallthinker",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -2483,6 +2485,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
],
|
||||
MODEL_ARCH.SMALLTHINKER: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -2704,6 +2724,7 @@ class VisionProjectorType:
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
QWEN25O = "qwen2.5o" # omni
|
||||
VOXTRAL = "voxtral"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -317,6 +317,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.feed_forward.router", # llama4 jamba
|
||||
"encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
|
||||
"model.layers.{bid}.mlp.gate.wg", # hunyuan
|
||||
"model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||
@@ -362,6 +363,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.c_fc_1", # exaone
|
||||
"model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
|
||||
"transformer_encoder.{bid}.ffn.w12", # neobert
|
||||
"model.layers.{bid}.block_sparse_moe.up", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -372,6 +374,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
|
||||
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
|
||||
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
|
||||
"model.layers.{bid}.block_sparse_moe.experts.up", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||
@@ -401,6 +404,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||
"transformer.h.{bid}.mlp.c_fc_0", # exaone
|
||||
"model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
@@ -410,6 +414,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ernie4.5-moe
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
|
||||
"model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
|
||||
"model.layers.{bid}.block_sparse_moe.experts.gate", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||
@@ -448,6 +453,7 @@ class TensorNameMap:
|
||||
"model.layers.h.{bid}.mlp.c_proj", # exaone
|
||||
"model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
|
||||
"transformer_encoder.{bid}.ffn.w3", # neobert
|
||||
"model.layers.{bid}.block_sparse_moe.down", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@@ -459,6 +465,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
|
||||
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
|
||||
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
|
||||
"model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
import re
|
||||
import logging
|
||||
import json
|
||||
@@ -12,6 +13,25 @@ try:
|
||||
except ImportError:
|
||||
SentencePieceProcessor = None
|
||||
|
||||
try:
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
||||
from mistral_common.tokens.tokenizers.utils import (
|
||||
_filter_valid_tokenizer_files,
|
||||
)
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer,
|
||||
)
|
||||
except ImportError:
|
||||
_mistral_common_installed = False
|
||||
MistralTokenizer = None
|
||||
Tekkenizer = None
|
||||
SentencePieceTokenizer = None
|
||||
_filter_valid_tokenizer_files = None
|
||||
else:
|
||||
_mistral_common_installed = True
|
||||
|
||||
|
||||
import gguf
|
||||
|
||||
from .gguf_writer import GGUFWriter
|
||||
@@ -592,3 +612,262 @@ class LlamaHfVocab(Vocab):
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
class MistralTokenizerType(str, Enum):
|
||||
spm = "spm"
|
||||
tekken = "tekken"
|
||||
|
||||
|
||||
# Copied from Transformers (Apache 2.0)
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/convert_slow_tokenizer.py#L1544
|
||||
|
||||
def bytes_to_unicode() -> dict[int, str]:
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
|
||||
characters the bpe code barfs on.
|
||||
|
||||
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
|
||||
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
|
||||
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
|
||||
tables between utf-8 bytes and unicode strings.
|
||||
"""
|
||||
bs = (
|
||||
list(range(ord("!"), ord("~") + 1))
|
||||
+ list(range(ord("¡"), ord("¬") + 1))
|
||||
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||
)
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
cs_str = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs_str))
|
||||
|
||||
|
||||
class MistralVocab(Vocab):
|
||||
tokenizer_model = "mistral"
|
||||
name = "mistral"
|
||||
|
||||
added_tokens_dict: dict[str, int] = {}
|
||||
added_tokens_list: list[str] = []
|
||||
|
||||
def __init__(self, base_path: Path):
|
||||
if not _mistral_common_installed:
|
||||
raise ImportError(
|
||||
"To use MistralVocab, please install the `mistral-common` package. "
|
||||
"You can install it with `pip install mistral-common`."
|
||||
)
|
||||
assert _filter_valid_tokenizer_files is not None, "mistral_common is not installed"
|
||||
assert MistralTokenizer is not None, "mistral_common is not installed"
|
||||
assert Tekkenizer is not None, "mistral_common is not installed"
|
||||
|
||||
logger.info(f"Loading Mistral tokenizer from {base_path}")
|
||||
|
||||
# Find the tokenizer files
|
||||
all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
|
||||
valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
|
||||
|
||||
if len(valid_tokenizer_files) == 0:
|
||||
raise ValueError(f"No tokenizer file found in the directory: {base_path}")
|
||||
# If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
|
||||
if len(valid_tokenizer_files) > 1:
|
||||
if "tekken.json" in valid_tokenizer_files:
|
||||
tokenizer_file = "tekken.json"
|
||||
else:
|
||||
tokenizer_file = sorted(valid_tokenizer_files)[-1]
|
||||
logger.warning(
|
||||
f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
|
||||
)
|
||||
else:
|
||||
tokenizer_file = valid_tokenizer_files[0]
|
||||
|
||||
self.tokenizer = MistralTokenizer.from_file(
|
||||
base_path / tokenizer_file
|
||||
).instruct_tokenizer.tokenizer
|
||||
self.tokenizer_type = (
|
||||
MistralTokenizerType.tekken
|
||||
if isinstance(self.tokenizer, Tekkenizer)
|
||||
else MistralTokenizerType.spm
|
||||
)
|
||||
self.vocab_size = self.tokenizer.n_words
|
||||
self.fname_tokenizer = base_path / tokenizer_file
|
||||
self._name = (
|
||||
"mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
|
||||
)
|
||||
|
||||
@property
|
||||
def tokenizer_name(self) -> str:
|
||||
return self._name
|
||||
|
||||
@property
|
||||
def gguf_tokenizer_model(self) -> str:
|
||||
return "llama" if self.tokenizer_type == MistralTokenizerType.spm else "gpt2"
|
||||
|
||||
def _sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
assert SentencePieceTokenizer is not None, "mistral_common is not installed"
|
||||
assert isinstance(self.tokenizer, SentencePieceTokenizer), (
|
||||
f"Expected SentencePieceTokenizer, got {type(self.tokenizer)}"
|
||||
)
|
||||
|
||||
for i in range(self.tokenizer._model.vocab_size()):
|
||||
piece = self.tokenizer._model.IdToPiece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score: float = self.tokenizer._model.GetScore(i)
|
||||
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
if self.tokenizer._model.IsUnknown(i):
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
if self.tokenizer._model.IsControl(i):
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
|
||||
if self.tokenizer._model.IsUnused(i):
|
||||
toktype = gguf.TokenType.UNUSED
|
||||
if self.tokenizer._model.IsByte(i):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
|
||||
yield text, score, toktype
|
||||
|
||||
def _tekken_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
assert Tekkenizer is not None, "mistral_common is not installed"
|
||||
assert isinstance(self.tokenizer, Tekkenizer), (
|
||||
f"Expected Tekkenizer, got {type(self.tokenizer)}"
|
||||
)
|
||||
|
||||
byte_encoder = bytes_to_unicode()
|
||||
for token_id in range(self.tokenizer.num_special_tokens):
|
||||
yield (
|
||||
self.tokenizer.id_to_piece(token_id).encode("utf-8"),
|
||||
0,
|
||||
gguf.TokenType.CONTROL
|
||||
)
|
||||
for token in self.tokenizer._tekken_token2id_nospecial:
|
||||
yield (
|
||||
self.token_bytes_to_string(token, byte_encoder).encode("utf-8"),
|
||||
0,
|
||||
gguf.TokenType.NORMAL,
|
||||
)
|
||||
|
||||
def get_token_id(self, token: str) -> int:
|
||||
assert SentencePieceTokenizer is not None and Tekkenizer is not None, "mistral_common is not installed"
|
||||
if self.tokenizer_type == MistralTokenizerType.spm:
|
||||
assert isinstance(self.tokenizer, SentencePieceTokenizer)
|
||||
return self.tokenizer._vocab.index(token)
|
||||
elif self.tokenizer_type == MistralTokenizerType.tekken:
|
||||
assert isinstance(self.tokenizer, Tekkenizer)
|
||||
return (
|
||||
self.tokenizer._vocab.index(token) + self.tokenizer.num_special_tokens
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown tokenizer type: {self.tokenizer_type}")
|
||||
|
||||
@property
|
||||
def bos_id(self) -> int:
|
||||
return self.tokenizer.bos_id
|
||||
|
||||
@property
|
||||
def eos_id(self) -> int:
|
||||
return self.tokenizer.eos_id
|
||||
|
||||
@property
|
||||
def pad_id(self) -> int:
|
||||
if self.tokenizer.pad_id == -1:
|
||||
return self.eos_id
|
||||
return self.tokenizer.pad_id
|
||||
|
||||
@property
|
||||
def unk_id(self) -> int:
|
||||
return self.tokenizer.unk_id
|
||||
|
||||
@property
|
||||
def bos_token(self) -> str:
|
||||
return self.tokenizer.id_to_piece(self.tokenizer.bos_id)
|
||||
|
||||
@property
|
||||
def eos_token(self) -> str:
|
||||
return self.tokenizer.id_to_piece(self.tokenizer.eos_id)
|
||||
|
||||
@property
|
||||
def pad_token(self) -> str:
|
||||
return self.tokenizer.id_to_piece(self.tokenizer.pad_id)
|
||||
|
||||
@property
|
||||
def unk_token(self) -> str:
|
||||
return self.tokenizer.id_to_piece(self.tokenizer.unk_id)
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
if self.tokenizer_type == MistralTokenizerType.spm:
|
||||
yield from self._sentencepiece_tokens()
|
||||
|
||||
elif self.tokenizer_type == MistralTokenizerType.tekken:
|
||||
yield from self._tekken_tokens()
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown tokenizer type: {self.tokenizer_type}")
|
||||
|
||||
@staticmethod
|
||||
def token_bytes_to_string(b, byte_encoder):
|
||||
return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
|
||||
|
||||
def extract_vocab_merges_from_model(self):
|
||||
# Adapted from Transformers (Apache 2.0)
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/convert_slow_tokenizer.py
|
||||
assert Tekkenizer is not None and isinstance(self.tokenizer, Tekkenizer), (
|
||||
f"Expected Tekkenizer, got {type(self.tokenizer)}"
|
||||
)
|
||||
mergeable_ranks = self.tokenizer._model._mergeable_ranks
|
||||
token_bytes_map = {
|
||||
rank: token_bytes for token_bytes, rank in mergeable_ranks.items()
|
||||
}
|
||||
merge_pairs = []
|
||||
|
||||
# Sort vocab by rank to ensure correct merge order
|
||||
for i in range(256, self.vocab_size - self.tokenizer.num_special_tokens):
|
||||
merged_token = token_bytes_map[i]
|
||||
local = []
|
||||
for j in range(1, len(merged_token)):
|
||||
left = merged_token[:j]
|
||||
right = merged_token[j:]
|
||||
if (
|
||||
left in mergeable_ranks
|
||||
and right in mergeable_ranks
|
||||
and (left + right) in mergeable_ranks
|
||||
):
|
||||
local.append((left, right, i))
|
||||
if not local:
|
||||
raise ValueError(
|
||||
f"Could not find valid merge for token at rank {i}: {merged_token.decode('latin-1')}"
|
||||
)
|
||||
local = sorted(
|
||||
local,
|
||||
key=lambda x: (mergeable_ranks[x[0]], mergeable_ranks[x[1]]),
|
||||
reverse=False,
|
||||
)
|
||||
merge_pairs.extend(local)
|
||||
merge_pairs = sorted(merge_pairs, key=lambda val: val[2], reverse=False)
|
||||
|
||||
byte_encoder = bytes_to_unicode()
|
||||
|
||||
decoded_merge_pairs = [
|
||||
[
|
||||
self.token_bytes_to_string(val[0], byte_encoder),
|
||||
self.token_bytes_to_string(val[1], byte_encoder),
|
||||
]
|
||||
for val in merge_pairs
|
||||
]
|
||||
|
||||
merges = [
|
||||
" ".join(
|
||||
[
|
||||
# ensure the spaces are properly encoded
|
||||
"".join(chr(ord(c) + 256) if c == " " else c for c in part)
|
||||
for part in pair
|
||||
]
|
||||
)
|
||||
for pair in decoded_merge_pairs
|
||||
]
|
||||
|
||||
return merges
|
||||
|
||||
@@ -956,6 +956,7 @@ extern "C" {
|
||||
// in the order they have appeared in the batch.
|
||||
// Rows: number of tokens for which llama_batch.logits[i] != 0
|
||||
// Cols: n_vocab
|
||||
// TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Logits for the ith token. For positive indices, Equivalent to:
|
||||
@@ -970,6 +971,7 @@ extern "C" {
|
||||
// in the order they have appeared in the batch.
|
||||
// shape: [n_outputs*n_embd]
|
||||
// Otherwise, returns NULL.
|
||||
// TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,3 +1,5 @@
|
||||
mistral-common>=1.8.3
|
||||
|
||||
-r ./requirements-convert_legacy_llama.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch~=2.2.1; platform_machine != "s390x"
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
docstring_parser~=0.15
|
||||
pydantic~=2.6.3
|
||||
pydantic~=2.11.7
|
||||
requests
|
||||
|
||||
@@ -112,6 +112,11 @@ class DocsGenerator:
|
||||
lines.append("")
|
||||
lines.append("List of GGML operations and backend support status.")
|
||||
lines.append("")
|
||||
lines.append("## How to add a backend to this table:")
|
||||
lines.append("")
|
||||
lines.append("1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)")
|
||||
lines.append("2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`")
|
||||
lines.append("")
|
||||
lines.append("Legend:")
|
||||
lines.append("- ✅ Fully supported by this backend")
|
||||
lines.append("- 🟡 Partially supported by this backend")
|
||||
|
||||
@@ -1 +1 @@
|
||||
3323219cd3cc050e5c7133cd4fc1e50d1f590faf
|
||||
b7bfde9c88aa4b063ce68dab6cc4f5c6caae37fd
|
||||
|
||||
@@ -88,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_SMOLLM3, "smollm3" },
|
||||
{ LLM_ARCH_LFM2, "lfm2" },
|
||||
{ LLM_ARCH_DREAM, "dream" },
|
||||
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -1933,6 +1934,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
}
|
||||
},
|
||||
{
|
||||
LLM_ARCH_SMALLTHINKER,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DREAM,
|
||||
{
|
||||
|
||||
@@ -92,6 +92,7 @@ enum llm_arch {
|
||||
LLM_ARCH_SMOLLM3,
|
||||
LLM_ARCH_LFM2,
|
||||
LLM_ARCH_DREAM,
|
||||
LLM_ARCH_SMALLTHINKER,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
+51
-16
@@ -105,7 +105,7 @@ llama_context::llama_context(
|
||||
|
||||
{
|
||||
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
||||
const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
|
||||
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
|
||||
|
||||
if (!supports_set_rows && !cparams.kv_unified) {
|
||||
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
|
||||
@@ -298,7 +298,7 @@ llama_context::llama_context(
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
// reserve pp graph first so that buffers are only allocated once
|
||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
if (!gf) {
|
||||
@@ -309,7 +309,7 @@ llama_context::llama_context(
|
||||
n_nodes_pp = ggml_graph_n_nodes(gf);
|
||||
}
|
||||
|
||||
// reserve with tg graph to get the number of splits and nodes
|
||||
// reserve with tg (token generation) graph to get the number of splits and nodes
|
||||
{
|
||||
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
|
||||
if (!gf) {
|
||||
@@ -508,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
|
||||
}
|
||||
|
||||
float * llama_context::get_logits() {
|
||||
output_reorder();
|
||||
|
||||
return logits;
|
||||
}
|
||||
|
||||
float * llama_context::get_logits_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
if (logits == nullptr) {
|
||||
throw std::runtime_error("no logits");
|
||||
@@ -550,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings() {
|
||||
output_reorder();
|
||||
|
||||
return embd;
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
if (embd == nullptr) {
|
||||
throw std::runtime_error("no embeddings");
|
||||
@@ -891,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!supports_set_rows) {
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
}
|
||||
|
||||
// TODO: hacky solution
|
||||
if (model.arch == LLM_ARCH_T5 && t_embd) {
|
||||
//cross.t_embd = t_embd;
|
||||
@@ -970,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
output_swaps.clear();
|
||||
|
||||
bool did_optimize = false;
|
||||
|
||||
@@ -1189,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
@@ -1207,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
continue;
|
||||
}
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
|
||||
// remember the swaps and apply them lazily upon logits/embeddings access
|
||||
output_swaps.push_back({ i, j_min });
|
||||
}
|
||||
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
@@ -1230,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
// wait for the computation to finish (automatically done when obtaining the model output)
|
||||
//synchronize();
|
||||
|
||||
if (!supports_set_rows) {
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1307,6 +1318,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
void llama_context::output_reorder() {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
|
||||
for (uint32_t s = 0; s < output_swaps.size(); ++s) {
|
||||
const uint32_t i0 = output_swaps[s].i0;
|
||||
const uint32_t i1 = output_swaps[s].i1;
|
||||
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output_swaps.clear();
|
||||
}
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
|
||||
@@ -181,6 +181,8 @@ private:
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
uint32_t output_reserve(int32_t n_outputs);
|
||||
|
||||
void output_reorder();
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
@@ -250,6 +252,13 @@ private:
|
||||
|
||||
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
||||
|
||||
struct swap_info {
|
||||
uint32_t i0;
|
||||
uint32_t i1;
|
||||
};
|
||||
|
||||
std::vector<swap_info> output_swaps;
|
||||
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
@@ -278,6 +287,10 @@ private:
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
// env: LLAMA_SET_ROWS (temporary)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
||||
bool supports_set_rows = false;
|
||||
|
||||
// perf
|
||||
mutable int64_t t_start_us = 0;
|
||||
mutable int64_t t_load_us = 0;
|
||||
|
||||
@@ -938,6 +938,100 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
return moe_out;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * probs,
|
||||
ggml_tensor * up_exps,
|
||||
ggml_tensor * gate_exps,
|
||||
ggml_tensor * down_exps,
|
||||
ggml_tensor * exp_probs_b,
|
||||
int64_t n_expert,
|
||||
int64_t n_expert_used,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il) const {
|
||||
const int64_t n_embd = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
|
||||
// add experts selection bias - introduced in DeepSeek V3
|
||||
// leave probs unbiased as it's later used to get expert weights
|
||||
ggml_tensor * selection_probs = probs;
|
||||
if (exp_probs_b != nullptr) {
|
||||
selection_probs = ggml_add(ctx0, probs, exp_probs_b);
|
||||
cb(selection_probs, "ffn_moe_probs_biased", il);
|
||||
}
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
|
||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||
cb(selected_experts, "ffn_moe_topk", il);
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx0,
|
||||
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
||||
cb(weights, "ffn_moe_weights", il);
|
||||
|
||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
|
||||
if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
|
||||
weights = ggml_soft_max(ctx0, weights);
|
||||
} else {
|
||||
weights = ggml_sigmoid(ctx0, weights);
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
|
||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||
|
||||
weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
|
||||
cb(weights, "ffn_moe_weights_norm", il);
|
||||
}
|
||||
|
||||
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
|
||||
|
||||
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(up, "ffn_moe_up", il);
|
||||
|
||||
ggml_tensor * experts = nullptr;
|
||||
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
||||
cur = ggml_reglu_split(ctx0, cur, up);
|
||||
cb(cur, "ffn_moe_reglu", il);
|
||||
|
||||
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
cb(experts, "ffn_moe_down", il);
|
||||
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
|
||||
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
||||
|
||||
assert(n_expert_used > 0);
|
||||
|
||||
// order the views before the adds
|
||||
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
|
||||
cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
|
||||
|
||||
ggml_build_forward_expand(gf, cur_experts[i]);
|
||||
}
|
||||
|
||||
// aggregate experts
|
||||
// note: here we explicitly use hparams.n_expert_used instead of n_expert_used
|
||||
// to avoid potentially a large number of add nodes during warmup
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14753
|
||||
ggml_tensor * moe_out = cur_experts[0];
|
||||
|
||||
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
|
||||
}
|
||||
|
||||
if (n_expert_used == 1) {
|
||||
// avoid returning a non-contiguous tensor
|
||||
moe_out = ggml_cont(ctx0, moe_out);
|
||||
}
|
||||
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
return moe_out;
|
||||
}
|
||||
|
||||
// input embeddings with optional lora
|
||||
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
@@ -625,6 +625,18 @@ struct llm_graph_context {
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il) const;
|
||||
|
||||
ggml_tensor * build_moe_ffn_from_probs(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * probs,
|
||||
ggml_tensor * up_exps,
|
||||
ggml_tensor * gate_exps,
|
||||
ggml_tensor * down_exps,
|
||||
ggml_tensor * exp_probs_b,
|
||||
int64_t n_expert,
|
||||
int64_t n_expert_used,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il) const;
|
||||
|
||||
//
|
||||
// inputs
|
||||
//
|
||||
|
||||
@@ -2,9 +2,15 @@
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
}
|
||||
} else {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+11
-4
@@ -98,7 +98,7 @@ struct llama_hparams {
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
std::array<int, 4> rope_sections;
|
||||
|
||||
@@ -140,7 +140,7 @@ struct llama_hparams {
|
||||
// for Classifiers
|
||||
uint32_t n_cls_out = 1;
|
||||
|
||||
// llama4
|
||||
// llama4 smallthinker
|
||||
uint32_t n_moe_layer_step = 0;
|
||||
uint32_t n_no_rope_layer_step = 4;
|
||||
uint32_t n_attn_temp_floor_scale = 8192;
|
||||
@@ -161,9 +161,10 @@ struct llama_hparams {
|
||||
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
||||
|
||||
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
|
||||
// dense_first means whether the pattern is start with a dense layer
|
||||
// note that if n_pattern == 0, all layers are SWA
|
||||
// if n_pattern == 1, all layers are dense
|
||||
// example: n_pattern = 3
|
||||
// example 1: n_pattern = 3, dense_first = false
|
||||
// il == 0: swa
|
||||
// il == 1: swa
|
||||
// il == 2: dense
|
||||
@@ -172,7 +173,13 @@ struct llama_hparams {
|
||||
// il == 5: dense
|
||||
// il == 6: swa
|
||||
// etc ...
|
||||
void set_swa_pattern(uint32_t n_pattern);
|
||||
// example 2: n_pattern = 2, dense_first = true
|
||||
// il == 0: dense
|
||||
// il == 1: swa
|
||||
// il == 2: dense
|
||||
// il == 3: swa
|
||||
// etc ...
|
||||
void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
|
||||
|
||||
// return true if one of the layers is SWA
|
||||
bool is_swa_any() const;
|
||||
|
||||
+195
-8
@@ -1369,7 +1369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
// that have no expert_gating_func model parameter set
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: type = LLM_TYPE_16B; break;
|
||||
@@ -1768,6 +1768,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096;
|
||||
hparams.set_swa_pattern(4, true);
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
hparams.n_no_rope_layer_step = hparams.n_layer;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_4B; break;
|
||||
case 52: type = LLM_TYPE_20B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
@@ -5165,6 +5188,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
||||
|
||||
GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
|
||||
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
|
||||
|
||||
// MoE branch
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -5490,6 +5549,11 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_SMALLTHINKER) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
}
|
||||
|
||||
vocab.print_info();
|
||||
}
|
||||
|
||||
@@ -16191,7 +16255,7 @@ private:
|
||||
{
|
||||
// PLaMo-2 uses combined QKV tensor
|
||||
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(qkv, "qkv", il);
|
||||
cb(qkv, "wqkv", il);
|
||||
|
||||
// split QKV tensor into Q, K, V
|
||||
const int64_t n_embd_head_q = hparams.n_embd_head_k;
|
||||
@@ -16231,7 +16295,7 @@ private:
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
|
||||
cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
|
||||
}
|
||||
|
||||
cb(cur, "attn_out", il);
|
||||
@@ -16306,8 +16370,9 @@ private:
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, last_conv,
|
||||
ggml_view_1d(ctx0, conv_states_all,
|
||||
(d_conv - 1)*(d_inner)*(n_seqs),
|
||||
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
|
||||
(d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
|
||||
kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
|
||||
cb(conv_states_all, "mamba_conv1d_state", il);
|
||||
|
||||
// 1D convolution
|
||||
x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
|
||||
@@ -16370,9 +16435,9 @@ private:
|
||||
// store last states
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0,
|
||||
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
|
||||
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
|
||||
kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
|
||||
ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
|
||||
ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
|
||||
cb(ssm_states_all, "mamba_ssm_states", il);
|
||||
|
||||
ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
|
||||
cb(y, "mamba_y_view", il);
|
||||
@@ -17010,6 +17075,119 @@ struct llm_build_lfm2 : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
template <bool iswa>
|
||||
struct llm_build_smallthinker : public llm_graph_context{
|
||||
llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
|
||||
inp_attn_type * inp_attn = nullptr;
|
||||
|
||||
if constexpr (iswa) {
|
||||
inp_attn = build_attn_inp_kv_unified_iswa();
|
||||
} else {
|
||||
inp_attn = build_attn_inp_kv_unified();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
ggml_tensor * probs = nullptr;
|
||||
|
||||
probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
|
||||
cb(probs, "ffn_moe_logits", il);
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self_attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
probs = ggml_get_rows(ctx0, probs, inp_out_ids);
|
||||
}
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// MoE branch
|
||||
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
nullptr, n_expert, n_expert_used,
|
||||
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
|
||||
|
||||
cb(ffn_out, "ffn_out", il);
|
||||
cur = ffn_out;
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
||||
llama_memory_i * res;
|
||||
|
||||
@@ -17448,6 +17626,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_lfm2>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
|
||||
} else {
|
||||
llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -17646,6 +17832,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_DOTS1:
|
||||
case LLM_ARCH_HUNYUAN_MOE:
|
||||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
|
||||
+58
-17
@@ -868,16 +868,30 @@ struct sql_printer : public printer {
|
||||
|
||||
struct csv_printer : public printer {
|
||||
void print_header() override {
|
||||
std::vector<std::string> fields = test_result::get_fields();
|
||||
|
||||
std::vector<std::string> fields = test_result::get_fields();
|
||||
std::vector<std::string> fields_csv = get_fields_csv();
|
||||
for (size_t i = 0; i < fields.size(); i++) {
|
||||
if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) {
|
||||
continue;
|
||||
}
|
||||
printf("\"%s\"%s", fields[i].c_str(), i < fields.size() - 1 ? "," : "");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void print_test_result(const test_result & result) override {
|
||||
std::vector<std::string> values = result.get_values();
|
||||
|
||||
std::vector<std::string> values = result.get_values();
|
||||
std::vector<std::string> fields = test_result::get_fields();
|
||||
std::vector<std::string> fields_csv = get_fields_csv();
|
||||
|
||||
for (size_t i = 0; i < values.size(); i++) {
|
||||
|
||||
if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Escape quotes and wrap in quotes for CSV
|
||||
std::string escaped_value = values[i];
|
||||
size_t pos = 0;
|
||||
@@ -889,6 +903,19 @@ struct csv_printer : public printer {
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static std::vector<std::string> get_fields_csv() {
|
||||
return {
|
||||
"op_name",
|
||||
"op_params",
|
||||
"supported",
|
||||
"error_message",
|
||||
"test_mode",
|
||||
"backend_reg_name",
|
||||
"backend_name",
|
||||
};
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||
@@ -3707,6 +3734,7 @@ struct test_im2col : public test_case {
|
||||
struct test_conv_2d : public test_case {
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
const ggml_type type_kernel;
|
||||
const int stride0;
|
||||
const int stride1;
|
||||
const int padding0;
|
||||
@@ -3724,7 +3752,11 @@ struct test_conv_2d : public test_case {
|
||||
// IM2COL -> MUL_MM graph will be built.
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR9(ne_input, ne_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn);
|
||||
return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
uint64_t op_flops(ggml_tensor * t) override {
|
||||
@@ -3755,10 +3787,11 @@ struct test_conv_2d : public test_case {
|
||||
}
|
||||
|
||||
test_conv_2d(std::array<int64_t, 4> ne_input = { 64, 64, 16, 1 },
|
||||
std::array<int64_t, 4> ne_kernel = { 3, 3, 1, 16 }, int stride0 = 1, int stride1 = 1, int padding0 = 0,
|
||||
int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) :
|
||||
std::array<int64_t, 4> ne_kernel = { 3, 3, 1, 16 }, ggml_type type_kernel = GGML_TYPE_F32, int stride0 = 1,
|
||||
int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) :
|
||||
ne_input(ne_input),
|
||||
ne_kernel(ne_kernel),
|
||||
type_kernel(type_kernel),
|
||||
stride0(stride0),
|
||||
stride1(stride1),
|
||||
padding0(padding0),
|
||||
@@ -3771,7 +3804,7 @@ struct test_conv_2d : public test_case {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
||||
ggml_set_name(input, "input");
|
||||
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
||||
ggml_set_name(kernel, "kernel");
|
||||
|
||||
if (cwhn) {
|
||||
@@ -5138,10 +5171,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
{ 16, 3, 256, 128, 8 }
|
||||
};
|
||||
|
||||
for (auto act_case : cases) {
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
|
||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
for (auto act_case : cases) {
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
|
||||
kernel_type, 1, 1, 0, 0, 1, 1, false));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -5167,8 +5203,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (uint32_t W : { 1, 141 }) {
|
||||
if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 &&
|
||||
calc_conv_output_size(H, KH, s1, p1, d1) > 0) {
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ W, H, Cin, 2 }, { KW, KH, Cin, Cout }, s0, s1, p0, p1, d0, d1, false));
|
||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ W, H, Cin, 2 }, { KW, KH, Cin, Cout }, kernel_type, s0, s1, p0, p1, d0, d1, false));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5813,11 +5851,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
{ 16, 3, 512, 128, 8 },
|
||||
};
|
||||
|
||||
for (auto act_case : cases) {
|
||||
// Direct CONV_2D
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
|
||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
for (auto act_case : cases) {
|
||||
// Direct CONV_2D
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
|
||||
kernel_type, 1, 1, 0, 0, 1, 1, false));
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
|
||||
@@ -148,7 +148,7 @@ struct lora_merge_ctx {
|
||||
|
||||
ctx_out = gguf_init_empty();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
|
||||
/*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
@@ -131,6 +131,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -150,6 +151,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
+76
-40
@@ -354,6 +354,16 @@ struct clip_model {
|
||||
ggml_tensor * conv1d_2_b = nullptr;
|
||||
ggml_tensor * mm_norm_pre_w = nullptr;
|
||||
ggml_tensor * mm_norm_mid_w = nullptr;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
bool audio_has_stack_frames() const {
|
||||
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
@@ -1483,49 +1493,52 @@ struct clip_graph {
|
||||
|
||||
cb(cur, "after_transformer", -1);
|
||||
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
|
||||
if (model.audio_has_stack_frames()) {
|
||||
// StackAudioFrames
|
||||
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
|
||||
{
|
||||
int64_t stride = n_embd * hparams.proj_stack_factor;
|
||||
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
|
||||
int64_t pad = padded_len - ggml_nelements(cur);
|
||||
if (pad > 0) {
|
||||
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
|
||||
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
|
||||
}
|
||||
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
||||
ggml_row_size(cur->type, stride), 0);
|
||||
int64_t stride = n_embd * hparams.proj_stack_factor;
|
||||
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
|
||||
int64_t pad = padded_len - ggml_nelements(cur);
|
||||
if (pad > 0) {
|
||||
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
|
||||
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
|
||||
}
|
||||
|
||||
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
||||
ggml_row_size(cur->type, stride), 0);
|
||||
cb(cur, "after_stacked", -1);
|
||||
}
|
||||
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
|
||||
// UltravoxProjector
|
||||
{
|
||||
// pre-norm
|
||||
cur = ggml_rms_norm(ctx0, cur, 1e-6);
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
||||
// pre-norm
|
||||
cur = ggml_rms_norm(ctx0, cur, 1e-6);
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
||||
|
||||
// ffn in
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
// ffn in
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
|
||||
// swiglu
|
||||
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
||||
cur = ggml_swiglu_swapped(ctx0, cur);
|
||||
// swiglu
|
||||
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
||||
cur = ggml_swiglu_swapped(ctx0, cur);
|
||||
|
||||
// mid-norm
|
||||
cur = ggml_rms_norm(ctx0, cur, 1e-6);
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
|
||||
// mid-norm
|
||||
cur = ggml_rms_norm(ctx0, cur, 1e-6);
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
|
||||
|
||||
// ffn out
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
}
|
||||
// ffn out
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
|
||||
// projector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
|
||||
// projector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
|
||||
} else {
|
||||
GGML_ABORT("%s: unknown projector type", __func__);
|
||||
}
|
||||
@@ -1670,8 +1683,7 @@ private:
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
// TODO @ngxson : find a way to move this outside
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
|
||||
if (ctx->model.audio_has_avgpool()) {
|
||||
ggml_tensor * cur = inpL;
|
||||
cur = ggml_transpose(ctx0, cur);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
@@ -1985,6 +1997,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
res = graph.build_llama4();
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
{
|
||||
res = graph.build_whisper_enc();
|
||||
@@ -2259,8 +2272,10 @@ struct clip_model_loader {
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX;
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
|
||||
if (hparams.n_mel_bins != 128) {
|
||||
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
|
||||
@@ -2315,7 +2330,7 @@ struct clip_model_loader {
|
||||
|
||||
// create data context
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
@@ -2544,6 +2559,15 @@ struct clip_model_loader {
|
||||
model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
|
||||
model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
||||
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
||||
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
||||
@@ -3570,17 +3594,26 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
||||
n_patches_sq /= (scale_factor * scale_factor);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
{
|
||||
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
||||
const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
|
||||
n_patches_sq = n_len / proj_stack_factor / 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
{
|
||||
// divide by 2 because of whisper
|
||||
// another divide by 2 because of nn.AvgPool1d(2, stride=2)
|
||||
n_patches_sq = img->nx / 4;
|
||||
n_patches_sq = img->nx;
|
||||
|
||||
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
||||
if (ctx->model.audio_has_stack_frames()) {
|
||||
GGML_ASSERT(proj_stack_factor > 0);
|
||||
const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor);
|
||||
n_patches_sq = n_len / proj_stack_factor;
|
||||
}
|
||||
|
||||
// whisper downscales input token by half after conv1d
|
||||
n_patches_sq /= 2;
|
||||
|
||||
if (ctx->model.audio_has_avgpool()) {
|
||||
// divide by 2 because of nn.AvgPool1d(2, stride=2)
|
||||
n_patches_sq /= 2;
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported projector type");
|
||||
@@ -3986,6 +4019,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
// do nothing
|
||||
} break;
|
||||
@@ -4086,6 +4120,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->model.projection->ne[1];
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
return ctx->model.mm_3_w->ne[1];
|
||||
@@ -4132,7 +4167,8 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
||||
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A;
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
|
||||
@@ -289,6 +289,10 @@ struct mtmd_context {
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
|
||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||
aud_beg = "[BEGIN_AUDIO]";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
-r ../../requirements/requirements-convert_legacy_llama.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pillow~=10.2.0
|
||||
pillow~=11.3.0
|
||||
torch~=2.2.1
|
||||
torchvision~=0.17.1
|
||||
|
||||
@@ -71,6 +71,7 @@ add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||
add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
|
||||
|
||||
# to test the big models, run: ./tests.sh big
|
||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
|
||||
+113
-71
@@ -1,18 +1,25 @@
|
||||
# quantize
|
||||
|
||||
This tool takes a GGUF input model file, typically in a high-precision format like F32 or BF16, and converts it to a quantized format.
|
||||
Quantization reduces the precision of model weights (e.g., from 32-bit floats to 4-bit integers), which shrinks the model's size and can speed up inference.
|
||||
This process however, may introduce some accuracy loss which is usually measured in [Perplexity](https://huggingface.co/docs/transformers/en/perplexity) (ppl) and/or [Kullback–Leibler Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) (kld).
|
||||
This can be minimized by using a suitable imatrix file.
|
||||
|
||||
You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
|
||||
|
||||
Note: It is synced from llama.cpp `main` every 6 hours.
|
||||
|
||||
Example usage:
|
||||
|
||||
```./llama-quantize [options] input-model-f32.gguf [output-model-quant.gguf] type [threads]```
|
||||
|
||||
```bash
|
||||
# obtain the official LLaMA model weights and place them in ./models
|
||||
# from Hugginface, obtain the official meta-llama/Llama-3.1-8B model weights and place them in ./models
|
||||
ls ./models
|
||||
llama-2-7b tokenizer_checklist.chk tokenizer.model
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
<folder containing weights and tokenizer json> vocab.json
|
||||
config.json model-00001-of-00004.safetensors model-00004-of-00004.safetensors README.md tokenizer.json
|
||||
generation_config.json model-00002-of-00004.safetensors model.safetensors.index.json special_tokens_map.json USE_POLICY.md
|
||||
LICENSE model-00003-of-00004.safetensors original tokenizer_config.json
|
||||
|
||||
# [Optional] for PyTorch .bin models like Mistral-7B
|
||||
ls ./models
|
||||
<folder containing weights and tokenizer json>
|
||||
@@ -21,7 +28,7 @@ ls ./models
|
||||
python3 -m pip install -r requirements.txt
|
||||
|
||||
# convert the model to ggml FP16 format
|
||||
python3 convert_hf_to_gguf.py models/mymodel/
|
||||
python3 convert_hf_to_gguf.py ./models/mymodel/
|
||||
|
||||
# quantize the model to 4-bits (using Q4_K_M method)
|
||||
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
@@ -37,40 +44,117 @@ Run the quantized model:
|
||||
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
|
||||
```
|
||||
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
Options:
|
||||
* `--allow-requantize` allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit
|
||||
* `--leave-output-tensor` will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
|
||||
* `--pure` disables k-quant mixtures and quantizes all tensors to the same type
|
||||
* `--imatrix` uses data in file generated by `llama-imatrix` as importance matrix for quant optimizations (highly recommended)
|
||||
* `--include-weights` use an importance matrix for tensor(s) in the list. Cannot be used with `--exclude-weights`
|
||||
* `--exclude-weights` use an importance matrix for tensor(s) in the list. Cannot be used with `--include-weights`
|
||||
* `--output-tensor-type` use a specific quant type for the output.weight tensor
|
||||
* `--token-embedding-type` use a specific quant type for the token embeddings tensor
|
||||
* `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file
|
||||
|
||||
Advanced options:
|
||||
* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times.
|
||||
* `--prune-layers` prune (remove) the layers in the list
|
||||
* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# naive Q4_K_M quantization using default settings and 8 CPU threads. Output will be "ggml-model-Q4_K_M.gguf"
|
||||
./llama-quantize input-model-f32.gguf q4_k_m 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model enabling re-quantization, leaving the output tensor unquantized and all others quantized at the same level (Q4_K)
|
||||
./llama-quantize --allow-requantize --leave-output-tensor --pure input-model-f32.gguf q4_k_m 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model using an importance matrix for specified tensors only (attn_v and ffn_down)
|
||||
./llama-quantize --imatrix imatrix.gguf --include-weights attn_v --include-weights ffn_down input-model-f32.gguf q4_k_m 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model setting output tensor to Q5_K_M, token embeddings to Q3_K_M, and keeping the input file's shards
|
||||
./llama-quantize --imatrix imatrix.gguf --output-tensor-type q5_k --token-embedding-type q3_k --keep-split input-model-f32.gguf q4_k_m 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model using a regex to quantize attn_k tensors in odd layers to Q5_K_M and attn_q tensors in even layers to Q3_K_M
|
||||
./llama-quantize --imatrix imatrix.gguf --tensor-type "\.(\d*[13579])\.attn_k=q5_k" --tensor-type "\.(\d*[02468])\.attn_q=q3_k" input-model-f32.gguf q4_k_m 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model setting tensors attn_v and ffn_down to Q5_K_M and pruning layers 20, 21, and 22
|
||||
./llama-quantize --imatrix imatrix.gguf --tensor-type attn_v=q5_k --tensor-type ffn_down=q5_k --prune-layers 20,21,22 input-model-f32.gguf q4_k_m 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# override expert used count metadata to 16, prune layers 20, 21, and 22 without quantizing the model (copy tensors) and use specified name for the output file
|
||||
./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8
|
||||
```
|
||||
|
||||
## Memory/Disk Requirements
|
||||
|
||||
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For exmaple (Llama 3.1):
|
||||
|
||||
| Model | Original size | Quantized size (Q4_K_M) |
|
||||
| ----: | ------------: | ----------------------: |
|
||||
| 8B | 32.1 GB | 4.9 GB |
|
||||
| 70B | 280.9 GB | 43.1 GB |
|
||||
| 405B | 1,625.1 GB | 249.1 GB |
|
||||
|
||||
| Model | Original size | Quantized size (Q4_0) |
|
||||
|------:|--------------:|----------------------:|
|
||||
| 7B | 13 GB | 3.9 GB |
|
||||
| 13B | 24 GB | 7.8 GB |
|
||||
| 30B | 60 GB | 19.5 GB |
|
||||
| 65B | 120 GB | 38.5 GB |
|
||||
|
||||
## Quantization
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example,
|
||||
|
||||
*(outdated)*
|
||||
### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|
||||
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
|
||||
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
|
||||
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
|
||||
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
|
||||
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
|
||||
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
|
||||
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
|
||||
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
|
||||
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
|
||||
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
|
||||
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
|
||||
| Measure | IQ1_S | IQ1_M | IQ2_XXS | IQ2_XS | IQ2_S | IQ2_M |
|
||||
| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
|
||||
| bits/weight | 2.0042 | 2.1460 | 2.3824 | 2.5882 | 2.7403 | 2.9294 |
|
||||
| size (GiB) | 1.87 | 2.01 | 2.23 | 2.42 | 2.56 | 2.74 |
|
||||
| prompt processing t/s @ 512 | 858.88 ±1.22 | 847.99 ±0.47 | 852.39 ±0.85 | 826.99 ±12.51 | 783.55 ±13.73 | 787.68 ±7.00 |
|
||||
| text generation t/s @ 128 | 79.73 ±0.79 | 72.92 ±0.14 | 79.86 ±0.22 | 78.04 ±0.46 | 77.30 ±2.47 | 74.44 ±0.15 |
|
||||
|
||||
| Measure | IQ3_XXS | IQ3_XS | IQ3_S | IQ3_M | IQ4_XS | IQ4_NL |
|
||||
| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
|
||||
| bits/weight | 3.2548 | 3.4977 | 3.6606 | 3.7628 | 4.4597 | 4.6818 |
|
||||
| size (GiB) | 3.04 | 3.27 | 3.42 | 3.52 | 4.17 | 4.38 |
|
||||
| prompt processing t/s @ 512 | 813.88 ±6.53 | 708.71 ±1.26 | 798.78 ±8.81 | 768.70 ±13.73 | 771.80 ±11.38 | 806.03 ±7.07 |
|
||||
| text generation t/s @ 128 | 73.95 ±0.20 | 71.67 ±0.54 | 69.31 ±0.63 | 70.15 ±0.33 | 77.51 ±0.20 | 76.63 ±0.28 |
|
||||
|
||||
|
||||
| Measure | Q2_K_S | Q2_K | Q3_K_S | Q3_K_M | Q3_K_L | Q4_K_S |
|
||||
| --------------------------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
|
||||
| bits/weight | 2.9697 | 3.1593 | 3.6429 | 3.9960 | 4.2979 | 4.6672 |
|
||||
| size (GiB) | 2.78 | 2.95 | 3.41 | 3.74 | 4.02 | 4.36 |
|
||||
| prompt processing t/s @ 512 | 798.91 ±6.40 | 784.45 ±7.85 | 752.17 ±7.94 | 783.44 ±9.92 | 761.17 ±7.55 | 818.55 ±9.58 |
|
||||
| text generation t/s @ 128 | 90.01 ±0.12 | 79.85 ±0.20 | 69.84 ±0.18 | 71.68 ±0.22 | 69.38 ±0.49 | 76.71 ±0.20 |
|
||||
|
||||
| Measure | Q4_K_S | Q4_K_M | Q5_K_S | Q5_K_M | Q6_K | Q8_0 |
|
||||
| --------------------------- | ------------ | ------------- | ------------ | ------------ | ------------- | ------------ |
|
||||
| bits/weight | 4.6672 | 4.8944 | 5.5704 | 5.7036 | 6.5633 | 8.5008 |
|
||||
| size (GiB) | 4.36 | 4.58 | 5.21 | 5.33 | 6.14 | 7.95 |
|
||||
| prompt processing t/s @ 512 | 818.55 ±9.58 | 821.81 ±21.44 | 752.52 ±0.99 | 758.69 ±7.43 | 812.01 ±10.82 | 865.09 ±8.30 |
|
||||
| text generation t/s @ 128 | 76.71 ±0.20 | 71.93 ±1.52 | 69.53 ±0.18 | 67.23 ±1.08 | 58.67 ±3.13 | 50.93 ±0.08 |
|
||||
|
||||
| Measure | F16 |
|
||||
| --------------------------- | ------------ |
|
||||
| bits/weight | 16.0005 |
|
||||
| size (GiB) | 14.96 |
|
||||
| prompt processing t/s @ 512 | 923.49 ±0.53 |
|
||||
| text generation t/s @ 128 | 29.17 ±0.04 |
|
||||
|
||||
## Background information on llama-quantize
|
||||
|
||||
- [k-quants](https://github.com/ggml-org/llama.cpp/pull/1684)
|
||||
- recent k-quants improvements and new i-quants
|
||||
- k-quants improvements and i-quants
|
||||
- [#2707](https://github.com/ggml-org/llama.cpp/pull/2707)
|
||||
- [#2807](https://github.com/ggml-org/llama.cpp/pull/2807)
|
||||
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4773)
|
||||
@@ -85,45 +169,3 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||
- [#5060 - Q3_K_XS](https://github.com/ggml-org/llama.cpp/pull/5060)
|
||||
- [#5196 - 3-bit i-quants](https://github.com/ggml-org/llama.cpp/pull/5196)
|
||||
- [quantization tuning](https://github.com/ggml-org/llama.cpp/pull/5320), [another one](https://github.com/ggml-org/llama.cpp/pull/5334), and [another one](https://github.com/ggml-org/llama.cpp/pull/5361)
|
||||
|
||||
**Llama 2 7B**
|
||||
|
||||
| Quantization | Bits per Weight (BPW) |
|
||||
|--------------|-----------------------|
|
||||
| Q2_K | 3.35 |
|
||||
| Q3_K_S | 3.50 |
|
||||
| Q3_K_M | 3.91 |
|
||||
| Q3_K_L | 4.27 |
|
||||
| Q4_K_S | 4.58 |
|
||||
| Q4_K_M | 4.84 |
|
||||
| Q5_K_S | 5.52 |
|
||||
| Q5_K_M | 5.68 |
|
||||
| Q6_K | 6.56 |
|
||||
|
||||
**Llama 2 13B**
|
||||
|
||||
Quantization | Bits per Weight (BPW)
|
||||
-- | --
|
||||
Q2_K | 3.34
|
||||
Q3_K_S | 3.48
|
||||
Q3_K_M | 3.89
|
||||
Q3_K_L | 4.26
|
||||
Q4_K_S | 4.56
|
||||
Q4_K_M | 4.83
|
||||
Q5_K_S | 5.51
|
||||
Q5_K_M | 5.67
|
||||
Q6_K | 6.56
|
||||
|
||||
**Llama 2 70B**
|
||||
|
||||
Quantization | Bits per Weight (BPW)
|
||||
-- | --
|
||||
Q2_K | 3.40
|
||||
Q3_K_S | 3.47
|
||||
Q3_K_M | 3.85
|
||||
Q3_K_L | 4.19
|
||||
Q4_K_S | 4.53
|
||||
Q4_K_M | 4.80
|
||||
Q5_K_S | 5.50
|
||||
Q5_K_M | 5.65
|
||||
Q6_K | 6.56
|
||||
|
||||
Reference in New Issue
Block a user