Compare commits

...

6 Commits

Author SHA1 Message Date
Aman Gupta 684b36101c ggml-cpu: FA add GEMM microkernel (#19422)
* ggml-cpu: FA add GEMM microkernel

* add guard for sizeless vector types

* fix case where DV % GGML_F32_EPR !=0

* move memset out of the loop

* move another memset out of the loop

* use RM=4 for arm

* simd_gemm: convert everything to int

* convert everything to size_t to avoid warnings

* fixup

* add pragma for ignoring aggressive loop optimizations
2026-02-15 11:09:24 +05:30
SamareshSingh 3a00c98584 cmake : fix KleidiAI install target failure with EXCLUDE_FROM_ALL (#19581)
* cmake: fix KleidiAI install target failure with EXCLUDE_FROM_ALL

Fix for the bug #19501 by adding EXCLUDE_FROM_ALL to FetchContent_Declare. This properly excludes KleidiAI from both build and install targets, preventing install failures when GGML_CPU_KLEIDIAI=ON is used.

The KleidiAI source files are still compiled into libggml-cpu.so, preserving all functionality.

* addressed code review comments
2026-02-15 06:22:53 +01:00
Sigbjørn Skjæret 079feab9e3 convert : ensure all models handle new experts count (#19621)
* ensure all models handle new experts count

* revert removal for PhiMoeModel, does not inherit from base
2026-02-14 22:22:32 +01:00
Anav Prasad 01d8eaa28d mtmd : Add Nemotron Nano 12B v2 VL support (#19547)
* nemotron nano v2 vlm support added

* simplified code; addressed reviews

* pre-downsample position embeddings during GGUF conversion for fixed input size
2026-02-14 14:07:00 +01:00
Georgi Gerganov 1725e316c1 models : optimize qwen3next graph (#19375)
* models : optimizing qwen3next graph

* cont

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* cont : remove redundant q, g chunking

* minor

* minor

* avoid passing masks around

* avoid concats during chunking

* naming + shapes

* update names and use prefix to disable CUDA graphs
2026-02-14 12:57:36 +02:00
Adrien Gallouët b7742cf321 ggml : fix GGML_DEBUG with OpenMP (#19599)
last_graph is only available without OpenMP, but
ggml_graph_compute_thread() is called in both cases.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-14 11:22:57 +01:00
18 changed files with 681 additions and 417 deletions
+117 -50
View File
@@ -2726,8 +2726,6 @@ class AfmoeModel(LlamaModel):
super().set_gguf_parameters()
# MoE parameters
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
self.gguf_writer.add_expert_shared_count(n_shared_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
@@ -2749,7 +2747,7 @@ class AfmoeModel(LlamaModel):
# Handle expert weights - they're already merged in the HF format
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -4074,6 +4072,87 @@ class InternVisionModel(MmprojModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register(
"NemotronH_Nano_VL_V2",
"RADIOModel",
)
class NemotronNanoV2VLModel(MmprojModel):
# ViT-Huge architecture parameters for RADIO v2.5-h
_vit_hidden_size = 1280
_vit_intermediate_size = 5120
_vit_num_layers = 32
_vit_num_heads = 16
def get_vision_config(self) -> dict[str, Any] | None:
# RADIO config doesn't have standard ViT parameters, so they need to be constructed manually
vision_config = self.global_config.get("vision_config")
if vision_config is None:
return None
# Add ViT-H parameters
vision_config = {
**vision_config,
"hidden_size": self._vit_hidden_size,
"intermediate_size": self._vit_intermediate_size,
"num_hidden_layers": self._vit_num_layers,
"num_attention_heads": self._vit_num_heads,
"image_size": self.global_config.get("force_image_size", 512),
}
return vision_config
def set_gguf_parameters(self):
if "image_mean" not in self.preprocessor_config:
self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
if "image_std" not in self.preprocessor_config:
self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]
super().set_gguf_parameters()
hparams = self.global_config
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
self.gguf_writer.add_vision_use_gelu(True)
downsample_ratio = hparams.get("downsample_ratio", 0.5)
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name or "pos_embed" in new_name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "input_conditioner" in name:
return
# RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
if "patch_generator.pos_embed" in name:
if not name.endswith(".weight"):
name += ".weight"
# Downsample position embeddings for fixed 512x512 image size
import torch.nn.functional as F
n_embd = self.hparams["hidden_size"]
image_size = self.global_config.get("force_image_size", 512)
patch_size = self.hparams["patch_size"]
target_patches_per_side = image_size // patch_size # 32
max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128
if target_patches_per_side != max_patches_per_side:
# Reshape to grid, interpolate, flatten back
data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd)
data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128]
data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side),
mode='bilinear', align_corners=True)
data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd]
data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd)
# Reshape linear patch embedding to conv2d format for ggml_conv_2d
# From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
if "patch_generator.embedder" in name:
patch_size = self.hparams["patch_size"]
n_embd = self.hparams["hidden_size"]
data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)
if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("WavTokenizerDec")
class WavTokenizerDecModel(TextModel):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -4116,8 +4195,6 @@ class Qwen2MoeModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
@@ -4162,7 +4239,7 @@ class Qwen2MoeModel(TextModel):
return
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -4913,13 +4990,13 @@ class PhiMoeModel(Phi3MiniModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"]))
self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.hparams["num_local_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -5331,7 +5408,7 @@ class KimiLinearModel(TextModel):
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=False)
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -5926,12 +6003,13 @@ class NomicBertModel(BertModel):
if "mlp.experts.bias" in name:
return # Explicitly return.
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
if "mlp.experts.mlp.w1" in name:
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"])
name += ".weight"
if "mlp.experts.mlp.w2" in name:
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"])
data_torch = data_torch.transpose(1, 2)
name += ".weight"
@@ -5941,7 +6019,6 @@ class NomicBertModel(BertModel):
super().set_gguf_parameters()
if self.is_moe:
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
def _is_tokenizer_xlmroberta(self) -> bool:
@@ -7055,6 +7132,8 @@ class Mamba2Model(TextModel):
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if "llm_config" in hparams:
hparams["text_config"] = hparams["llm_config"]
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
@@ -7176,8 +7255,8 @@ class JambaModel(TextModel):
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"]))
self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"]))
self.gguf_writer.add_file_type(self.ftype)
_experts: list[dict[str, Tensor]] | None = None
@@ -7195,7 +7274,7 @@ class JambaModel(TextModel):
# process the experts separately
if ".feed_forward.experts." in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
@@ -7343,8 +7422,6 @@ class OlmoeModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
_experts: list[dict[str, Tensor]] | None = None
@@ -7352,7 +7429,7 @@ class OlmoeModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -7933,10 +8010,6 @@ class MiniMaxM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
_experts_cache: dict[int, dict[str, Tensor]] = {}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["num_experts"] = self.hparams["num_local_experts"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -7949,7 +8022,7 @@ class MiniMaxM2Model(TextModel):
# merge expert weights
if 'experts' in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
expert_cache = self._experts_cache.setdefault(bid, {})
@@ -9154,7 +9227,6 @@ class ExaoneMoEModel(Exaone4Model):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
moe_intermediate_size = self.hparams["moe_intermediate_size"]
num_shared_experts = self.hparams["num_shared_experts"]
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -9195,7 +9267,7 @@ class ExaoneMoEModel(Exaone4Model):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -9346,7 +9418,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
# case, the model architecture needs to be updated to a standard
# "granite" or "granitemoe" model
if not self._ssm_layers:
has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
has_experts = self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)
new_arch = (
gguf.MODEL_ARCH.GRANITE_MOE
if has_experts else
@@ -9542,6 +9614,14 @@ class NemotronHModel(GraniteHybridModel):
self.gguf_writer.add_add_bos_token(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision model and projector tensors for VLM models (handled by mmproj) (e.g., Nemotron Nano 12B v2 VL)
if name.startswith(("vision_model.", "mlp1.")):
return
# Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
if name.startswith("language_model."):
name = name[len("language_model."):]
if self.is_moe and bid is not None:
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -9636,7 +9716,6 @@ class BailingMoeModel(TextModel):
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_weights_scale(1.0)
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
@@ -9670,7 +9749,7 @@ class BailingMoeModel(TextModel):
yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
return
elif name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -9741,7 +9820,6 @@ class BailingMoeV2Model(TextModel):
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
@@ -9752,7 +9830,7 @@ class BailingMoeV2Model(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "mlp.experts" in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -9798,8 +9876,6 @@ class GroveMoeModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
@@ -9820,7 +9896,7 @@ class GroveMoeModel(TextModel):
# process the experts separately
if name.find("chunk_experts") != -1:
n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
n_experts = self.find_hparam(["num_local_experts", "num_experts"]) // 2 # see add_experts_per_group
assert bid is not None
if self._chunk_experts is None:
@@ -9847,7 +9923,7 @@ class GroveMoeModel(TextModel):
else:
return
elif name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -10240,7 +10316,6 @@ class HunYuanMoEModel(TextModel):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
moe_intermediate_size = hparams["moe_intermediate_size"]
@@ -10283,7 +10358,7 @@ class HunYuanMoEModel(TextModel):
return
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -10325,16 +10400,9 @@ class LLaDAMoEModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_mask_token_id(156895)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_diffusion_shift_logits(False)
@@ -10345,7 +10413,7 @@ class LLaDAMoEModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -10682,7 +10750,6 @@ class LFM2MoeModel(TextModel):
super().set_gguf_parameters()
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
@@ -10703,7 +10770,7 @@ class LFM2MoeModel(TextModel):
# merge expert weights
if 'experts' in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
expert_cache = self._experts_cache.setdefault(bid, {})
@@ -10813,9 +10880,9 @@ class SmallThinkerModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
if (n_experts := self.hparams.get("moe_num_primary_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
if (n_experts_used := self.hparams.get("moe_num_active_primary_experts")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -10840,7 +10907,7 @@ class SmallThinkerModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
n_experts = self.hparams.get("moe_num_primary_experts") or self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
+3 -6
View File
@@ -569,12 +569,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
cmake_policy(SET CMP0135 NEW)
endif()
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
FetchContent_Declare(KleidiAI_Download
URL ${KLEIDIAI_DOWNLOAD_URL}
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
FetchContent_MakeAvailable(KleidiAI_Download)
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
@@ -585,11 +587,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
# Remove kleidiai target after fetching it
if (TARGET kleidiai)
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
list(APPEND GGML_CPU_SOURCES
ggml-cpu/kleidiai/kleidiai.cpp
ggml-cpu/kleidiai/kernels.cpp
+2 -2
View File
@@ -6,8 +6,8 @@
#include "ggml-impl.h"
#include "simd-mappings.h"
#define GGML_FA_TILE_Q 32
#define GGML_FA_TILE_KV 16
#define GGML_FA_TILE_Q 64
#define GGML_FA_TILE_KV 64
#ifdef __cplusplus
+12 -4
View File
@@ -2874,8 +2874,8 @@ struct ggml_cplan ggml_graph_plan(
const int64_t DV = node->src[2]->ne[0];
// Tiled flash attention scratch (tile sizes defined in common.h)
// Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
// Per-thread: Q_q + KQ + mask + VKQ32 + V32 + K_f32 + padding
size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV + GGML_FA_TILE_KV*DK)*n_tasks;
// Decode path: n_kv_chunks = n_tasks (one chunk per thread)
// Per-thread: VKQ accmulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
@@ -2947,7 +2947,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/*.use_ref =*/ cplan->use_ref,
};
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
#ifdef GGML_USE_OPENMP
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p\n", state->ith, (const void *)cplan);
#else
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
#endif
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];
@@ -2974,7 +2978,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
#ifdef GGML_USE_OPENMP
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p\n", state->ith, (const void *)cplan);
#else
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
#endif
ggml_barrier(state->threadpool);
+69 -54
View File
@@ -3,6 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "binary-ops.h"
#include "simd-gemm.h"
#include "ggml.h"
#include "unary-ops.h"
#include "vec.h"
@@ -8389,10 +8390,6 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
GGML_ASSERT(k->type == v->type);
const ggml_type kv_type = k->type;
const auto * kv_type_traits_cpu = ggml_get_type_traits_cpu(kv_type);
const ggml_from_float_t kv_from_float = kv_type_traits_cpu->from_float;
const ggml_vec_dot_t kv_vec_dot = kv_type_traits_cpu->vec_dot;
const size_t kv_type_size = ggml_type_size(kv_type);
// broadcast factors
const int64_t rk2 = neq2/nek2;
@@ -8424,8 +8421,6 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
static constexpr int Q_TILE_SZ = ggml_fa_tile_config::Q;
static constexpr int KV_TILE_SZ = ggml_fa_tile_config::KV;
GGML_ASSERT(nek1 % KV_TILE_SZ == 0 && "KV sequence length must be divisible by KV_TILE_SZ");
int ir = ir0;
while (ir < ir1) {
// q indices for the start of this tile
@@ -8452,18 +8447,20 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
}
// Per-thread scratch layout:
// Q_q: Q_TILE_SZ * DK (converted Q tile in KV type)
// Q_q: Q_TILE_SZ * DK (converted Q tile — F32 for GEMM, KV type for scalar)
// KQ: Q_TILE_SZ * KV_TILE_SZ (attention scores in float)
// mask: Q_TILE_SZ * KV_TILE_SZ (mask in float)
// VKQ32: Q_TILE_SZ * DV (FP32 output accumulator)
// V32: KV_TILE_SZ * DV (F32 buffer for V tile - used for f166 conversion)
float * base = (float *) params->wdata + ith*(Q_TILE_SZ*DK + 2*Q_TILE_SZ*KV_TILE_SZ + Q_TILE_SZ*DV + KV_TILE_SZ*DV + CACHE_LINE_SIZE_F32);
// V32: KV_TILE_SZ * DV (F32 buffer for V tile)
// K_f32: KV_TILE_SZ * DK (F32 buffer for K tile — GEMM path)
float * base = (float *) params->wdata + ith*(Q_TILE_SZ*DK + 2*Q_TILE_SZ*KV_TILE_SZ + Q_TILE_SZ*DV + KV_TILE_SZ*DV + KV_TILE_SZ*DK + CACHE_LINE_SIZE_F32);
void * Q_q = base;
float * KQ = (float *)((char *)base + Q_TILE_SZ * DK * sizeof(float));
float * mask32 = KQ + Q_TILE_SZ * KV_TILE_SZ;
float * VKQ32 = mask32 + Q_TILE_SZ * KV_TILE_SZ;
float * V32 = VKQ32 + Q_TILE_SZ * DV; // F32 buffer for V tile
float * V32 = VKQ32 + Q_TILE_SZ * DV;
float * K_f32 = V32 + KV_TILE_SZ * DV;
memset(VKQ32, 0, Q_TILE_SZ * DV * sizeof(float));
memset(mask32, 0, Q_TILE_SZ * KV_TILE_SZ * sizeof(float));
@@ -8476,28 +8473,38 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
const int iv3 = iq3 / rv3;
const int iv2 = iq2 / rv2;
for (int tq = 0; tq < tile_rows; tq++) {
const float * pq = (const float *) ((char *) q->data + ((iq1 + tq)*nbq1 + iq2*nbq2 + iq3*nbq3));
kv_from_float(pq, (char *)Q_q + tq * DK * kv_type_size, DK);
}
// Zero-pad remaining rows
for (int tq = tile_rows; tq < Q_TILE_SZ; tq++) {
memset((char *)Q_q + tq * DK * kv_type_size, 0, DK * kv_type_size);
{
float * Q_f32 = (float *)Q_q;
for (int tq = 0; tq < tile_rows; tq++) {
const float * pq = (const float *) ((char *) q->data + ((iq1 + tq)*nbq1 + iq2*nbq2 + iq3*nbq3));
memcpy(Q_f32 + tq * DK, pq, DK * sizeof(float));
}
for (int tq = tile_rows; tq < Q_TILE_SZ; tq++) {
memset(Q_f32 + tq * DK, 0, DK * sizeof(float));
}
}
memset(K_f32, 0, DK * KV_TILE_SZ * sizeof(float));
memset(V32, 0, KV_TILE_SZ * DV * sizeof(float));
for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) {
const int kv_tile = (int)std::min((int64_t)KV_TILE_SZ, nek1 - ic);
// skip the tile entirely if all the masks are -inf
if (mask) {
bool can_skip = true;
for (int tq = 0; tq < tile_rows; tq++) {
const ggml_fp16_t * mp_row = (const ggml_fp16_t *)((const char *) mask->data + (iq1 + tq)*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]);
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
for (int tk = 0; tk < kv_tile; tk++) {
mask32[tq * KV_TILE_SZ + tk] = slope * GGML_CPU_FP16_TO_FP32(mp_row[ic + tk]);
if (mask32[tq * KV_TILE_SZ + tk] != -INFINITY) {
can_skip = false;
}
}
// Pad remaining mask entries with -inf
for (int tk = kv_tile; tk < KV_TILE_SZ; tk++) {
mask32[tq * KV_TILE_SZ + tk] = -INFINITY;
}
}
if (can_skip) {
@@ -8505,13 +8512,32 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
}
}
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
const void * q_row = (const char *)Q_q + tq * DK * kv_type_size;
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const void * k_row = (const char *) k->data + ((ic + tk)*nbk1 + ik2*nbk2 + ik3*nbk3);
float s;
kv_vec_dot(DK, &s, 0, k_row, 0, q_row, 0, 1);
KQ[tq * KV_TILE_SZ + tk] = s * scale;
// Pack K tile transposed: K_f32[dk][kv] so KV_TILE is contiguous (SIMD dim)
// Zero-pad the last tile so the GEMM always operates on KV_TILE_SZ columns
for (int tk = 0; tk < kv_tile; tk++) {
const char * k_data = (const char *)k->data + (ic + tk)*nbk1 + ik2*nbk2 + ik3*nbk3;
if (kv_type == GGML_TYPE_F16) {
const ggml_fp16_t * k_f16 = (const ggml_fp16_t *)k_data;
for (int64_t dk = 0; dk < DK; dk++) {
K_f32[dk * KV_TILE_SZ + tk] = GGML_CPU_FP16_TO_FP32(k_f16[dk]);
}
} else {
const float * k_f32_src = (const float *)k_data;
for (int64_t dk = 0; dk < DK; dk++) {
K_f32[dk * KV_TILE_SZ + tk] = k_f32_src[dk];
}
}
}
memset(KQ, 0, Q_TILE_SZ * KV_TILE_SZ * sizeof(float));
simd_gemm(KQ, (const float *)Q_q, K_f32, Q_TILE_SZ, DK, KV_TILE_SZ);
ggml_vec_scale_f32(Q_TILE_SZ * KV_TILE_SZ, KQ, scale);
// Set padded KQ entries to -inf so softmax gives them zero weight
if (kv_tile < KV_TILE_SZ) {
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
for (int tk = kv_tile; tk < KV_TILE_SZ; tk++) {
KQ[tq * KV_TILE_SZ + tk] = -INFINITY;
}
}
}
@@ -8551,33 +8577,22 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
S[tq] += ggml_vec_soft_max_f32(KV_TILE_SZ, kq_row, kq_row, Mnew);
}
// Convert V tile to F32 first (if F16), then do MAD
// On x86, ggml_vec_mad_f16 internall converts F16<->F32 on every load/store, so pre-converting is faster.
// TODO: on ARM, native f16 should be faster
if (kv_type == GGML_TYPE_F16) {
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const ggml_fp16_t * v_row = (const ggml_fp16_t *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3));
ggml_fp16_to_fp32_row(v_row, V32 + tk * DV, DV);
}
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
if (skip[tq]) continue;
float * vkq_row = VKQ32 + tq * DV;
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const float p = KQ[tq * KV_TILE_SZ + tk];
ggml_vec_mad_f32(DV, vkq_row, V32 + tk * DV, p);
}
}
} else {
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
if (skip[tq]) continue;
float * vkq_row = VKQ32 + tq * DV;
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const float p = KQ[tq * KV_TILE_SZ + tk];
const float * v_row = (const float *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3));
ggml_vec_mad_f32(DV, vkq_row, v_row, p);
}
// V accumulation: VKQ32 += softmax(KQ) * V
// Pack V tile to contiguous F32, zero-padded
for (int tk = 0; tk < kv_tile; tk++) {
const char * v_data = (const char *)v->data + (ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3;
if (kv_type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((const ggml_fp16_t *)v_data, V32 + tk * DV, DV);
} else {
memcpy(V32 + tk * DV, v_data, DV * sizeof(float));
}
}
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
if (skip[tq]) {
memset(KQ + tq * KV_TILE_SZ, 0, KV_TILE_SZ * sizeof(float));
}
}
simd_gemm(VKQ32, KQ, V32, Q_TILE_SZ, KV_TILE_SZ, DV);
}
// sinks (apply only to valid rows in the tile)
@@ -8794,15 +8809,15 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int64_t dr = (nr + nchunk - 1) / nchunk;
static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
const bool use_tiled = !use_ref &&
bool use_tiled = !use_ref &&
(q->type == GGML_TYPE_F32 &&
kv_is_f32_or_f16 &&
k->type == v->type &&
nek1 % KV_TILE_SZ == 0 &&
neq1 >= Q_TILE_SZ);
#ifdef GGML_SIMD
use_tiled &= (DV % GGML_F32_EPR == 0);
#endif
int current_chunk = ith;
while (current_chunk < nchunk) {
+139
View File
@@ -0,0 +1,139 @@
#pragma once
// Computes C[M x N] += A[M x K] * B[K x N]
#include "ggml-cpu-impl.h"
#include "vec.h"
#include "common.h"
#include "simd-mappings.h"
// TODO: add support for sizeless vector types
#if defined(GGML_SIMD) && !defined(__ARM_FEATURE_SVE) && !defined(__riscv_v_intrinsic)
// TODO: untested on avx512
// These are in units of GGML_F32_EPR
#if defined(__AVX512F__) || defined (__ARM_NEON__)
static constexpr int GEMM_RM = 4;
static constexpr int GEMM_RN = 4; // 16+4+1 = 25/32
#elif defined(__AVX2__) || defined(__AVX__)
static constexpr int GEMM_RM = 6;
static constexpr int GEMM_RN = 2; // 12+2+1 = 15/16
#else
static constexpr int GEMM_RM = 2;
static constexpr int GEMM_RN = 2;
#endif
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations"
#endif
template <int RM, int RN>
static inline void simd_gemm_ukernel(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int64_t K, int64_t N,
int64_t ii, int64_t jj)
{
static constexpr int KN = GGML_F32_EPR;
GGML_F32_VEC acc[RM][RN];
for (int i = 0; i < RM; i++) {
for (int r = 0; r < RN; r++) {
acc[i][r] = GGML_F32_VEC_LOAD(C + (ii + i) * N + jj + r * KN);
}
}
for (int64_t kk = 0; kk < K; kk++) {
GGML_F32_VEC Bv[RN];
for (int r = 0; r < RN; r++) {
Bv[r] = GGML_F32_VEC_LOAD(B + kk * N + jj + r * KN);
}
for (int i = 0; i < RM; i++) {
GGML_F32_VEC p = GGML_F32_VEC_SET1(A[(ii + i) * K + kk]);
for (int r = 0; r < RN; r++) {
acc[i][r] = GGML_F32_VEC_FMA(acc[i][r], Bv[r], p);
}
}
}
for (int i = 0; i < RM; i++) {
for (int r = 0; r < RN; r++) {
GGML_F32_VEC_STORE(C + (ii + i) * N + jj + r * KN, acc[i][r]);
}
}
}
// C[M x N] += A[M x K] * B[K x N]
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int64_t M, int64_t K, int64_t N)
{
static constexpr int KN = GGML_F32_EPR;
int64_t ii = 0;
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
int64_t jj = 0;
for (; jj + GEMM_RN * KN <= N; jj += GEMM_RN * KN) {
simd_gemm_ukernel<GEMM_RM, GEMM_RN>(C, A, B, K, N, ii, jj);
}
for (; jj + KN <= N; jj += KN) {
simd_gemm_ukernel<GEMM_RM, 1>(C, A, B, K, N, ii, jj);
}
for (; jj < N; jj++) {
for (int i = 0; i < GEMM_RM; i++) {
float a = C[(ii + i) * N + jj];
for (int64_t kk = 0; kk < K; kk++) {
a += A[(ii + i) * K + kk] * B[kk * N + jj];
}
C[(ii + i) * N + jj] = a;
}
}
}
// Tail rows: one at a time
for (; ii < M; ii++) {
int64_t jj = 0;
for (; jj + GEMM_RN * KN <= N; jj += GEMM_RN * KN) {
simd_gemm_ukernel<1, GEMM_RN>(C, A, B, K, N, ii, jj);
}
for (; jj + KN <= N; jj += KN) {
simd_gemm_ukernel<1, 1>(C, A, B, K, N, ii, jj);
}
for (; jj < N; jj++) {
float a = C[ii * N + jj];
for (int64_t kk = 0; kk < K; kk++) {
a += A[ii * K + kk] * B[kk * N + jj];
}
C[ii * N + jj] = a;
}
}
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
#else // scalar path
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int64_t M, int64_t K, int64_t N)
{
for (int64_t i = 0; i < M; i++) {
for (int64_t j = 0; j < N; j++) {
float sum = C[i * N + j];
for (int64_t kk = 0; kk < K; kk++) {
sum += A[i * K + kk] * B[kk * N + j];
}
C[i * N + j] = sum;
}
}
}
#endif // GGML_SIMD
+5 -1
View File
@@ -2872,6 +2872,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
const std::string delta_net_prefix = "dnet_add";
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2902,7 +2903,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
// by means of matching node names. See
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
@@ -4544,6 +4546,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_TRUNC:
// TODO: should become:
//return ggml_is_contiguous_rows(op->src[0]);
return ggml_is_contiguous(op->src[0]);
default:
return false;
@@ -273,6 +273,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
case GGML_OP_DIAG:
case GGML_OP_MUL:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
case GGML_OP_GLU:
case GGML_OP_SCALE:
+1
View File
@@ -3830,6 +3830,7 @@ class VisionProjectorType:
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
# Items here are (block size, type size)
+10 -1
View File
@@ -1346,6 +1346,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.cls_token", # Intern-S1
"vision_model.class_embedding", # llama 4
"model.vision.patch_embedding.cls_embedding", # cogvlm
"vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -1360,6 +1361,7 @@ class TensorNameMap:
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
"siglip2.vision_model.embeddings.patch_embedding",
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1376,12 +1378,14 @@ class TensorNameMap:
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
"vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
"vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1446,6 +1450,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1462,6 +1467,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1477,6 +1483,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1493,6 +1500,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1515,6 +1523,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
-6
View File
@@ -489,9 +489,6 @@ private:
ggml_tensor * build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_layer_ffn(
@@ -506,9 +503,6 @@ private:
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
// returns pair of output and new state
+257 -293
View File
@@ -16,17 +16,6 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
ggml_tensor * causal_mask =
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
GGML_TRI_TYPE_LOWER);
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
ggml_build_forward_expand(gf, causal_mask);
ggml_build_forward_expand(gf, identity);
ggml_build_forward_expand(gf, diag_mask);
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -36,7 +25,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
cur = build_layer_attn_linear(inp->get_recr(), cur, il);
} else {
// Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@@ -99,11 +88,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
ggml_tensor * b,
ggml_tensor * s,
int il) {
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
@@ -113,134 +99,123 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
GGML_ASSERT(S_k == S_v);
GGML_ASSERT(H_v % H_k == 0);
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
const float eps_norm = hparams.f_norm_rms_eps;
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
const float scale = 1.0f / sqrtf(S_v);
const float scale = 1.0f / sqrtf(S_k);
q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta);
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
cb(beta, "beta_in", il);
cb(b, "b_in", il);
cb(g, "g_in", il);
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
g = ggml_permute(ctx0, g, 2, 1, 3, 0); // [ 1, n_tokens, H_v, n_seqs]
b = ggml_permute(ctx0, b, 2, 0, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
const int CS = CHUNK_SIZE;
cb(q, "q_perm", il);
cb(k, "k_perm", il);
cb(v, "v_perm", il);
cb(beta, "beta_perm", il);
cb(g, "g_perm", il);
cb(state, "state_in", il);
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
// Do padding
const int64_t chunk_size = CHUNK_SIZE;
const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
const int64_t n_chunks = (n_tokens + pad) / chunk_size;
const int pad = (CS - n_tokens % CS) % CS;
const int n_chunks = (n_tokens + pad) / CS;
q = ggml_pad(ctx0, q, 0, pad, 0, 0);
k = ggml_pad(ctx0, k, 0, pad, 0, 0);
v = ggml_pad(ctx0, v, 0, pad, 0, 0);
g = ggml_pad(ctx0, g, pad, 0, 0, 0);
beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
g = ggml_pad(ctx0, g, 0, pad, 0, 0);
b = ggml_pad(ctx0, b, 0, pad, 0, 0);
cb(q, "q_pad", il);
cb(k, "k_pad", il);
cb(v, "v_pad", il);
cb(beta, "beta_pad", il);
cb(g, "g_pad", il);
ggml_tensor * v_b = ggml_mul(ctx0, v, b);
ggml_tensor * k_b = ggml_mul(ctx0, k, b);
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
cb(v_b, "v_b", il);
cb(k_b, "k_b", il);
cb(v_beta, "v_beta", il);
cb(k_beta, "k_beta", il);
q = ggml_reshape_4d(ctx0, q, S_k, CS, n_chunks, H_k * n_seqs);
k = ggml_reshape_4d(ctx0, k, S_k, CS, n_chunks, H_k * n_seqs);
k_b = ggml_reshape_4d(ctx0, k_b, S_k, CS, n_chunks, H_v * n_seqs);
v = ggml_reshape_4d(ctx0, v, S_v, CS, n_chunks, H_v * n_seqs);
v_b = ggml_reshape_4d(ctx0, v_b, S_v, CS, n_chunks, H_v * n_seqs);
q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
g = ggml_reshape_4d(ctx0, g, CS, 1, n_chunks, H_v * n_seqs);
b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);
g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
// [CS, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_cs = ggml_cumsum(ctx0, g);
cb(g_cs, "g_cs", il);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_cs_i = g_cs;
ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
// [CS, CS, n_chunks, H_v * n_seqs]
ggml_tensor * decay_mask;
decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
decay_mask = ggml_exp(ctx0, decay_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
cb(decay_mask, "decay_mask", il);
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
// [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * kb;
kb = ggml_mul_mat(ctx0, k, k_b);
kb = ggml_mul (ctx0, kb, decay_mask);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * attn;
attn = ggml_tri(ctx0, kb, GGML_TRI_TYPE_LOWER);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * identity;
identity = ggml_view_1d(ctx0, attn, CS, 0);
identity = ggml_fill (ctx0, identity, 1.0f);
identity = ggml_diag (ctx0, identity);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * lhs = ggml_add(ctx0, attn, identity);
cb(lhs, "dnet_add_ch_lhs", il);
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
attn = ggml_neg(ctx0, attn);
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_add(ctx0, lin_solve, identity);
cb(attn, "dnet_add_ch_attn_solved", il); // [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// [S_v, CS, n_chunks, H_v * n_seqs]
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_b)), attn);
ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// [CS, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_exp = ggml_exp(ctx0, g_cs);
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
k_b = ggml_cont(ctx0, ggml_transpose(ctx0, k_b));
// [CS, S_k, n_chunks, H_k * n_seqs]
ggml_tensor * kbg = ggml_mul(ctx0, k_b, g_exp);
cb(kbg, "k_beta_g_exp", il);
// [S_k, CS, n_chunks, H_k * n_seqs]
ggml_tensor * k_cd = ggml_mul_mat(ctx0, kbg, attn);
cb(k_cd, "k_cumdecay", il);
// [S_k, CS, n_chunks, H_k * n_seqs]
ggml_tensor * g_exp_t = ggml_transpose(ctx0, g_exp);
ggml_tensor * q_g_exp = ggml_mul(ctx0, q, g_exp_t);
// [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
kq = ggml_mul(ctx0, kq, decay_mask);
kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
cb(kq, "kq", il);
// vectorized calculation of key_gdiff
// improved from the chunked version:
@@ -250,109 +225,98 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
// get last element in g_cumsum along chunk_size dimension (ne0)
// get last element in g_cumsum along CS dimension (ne0)
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
// [1, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, 1, g_cs->ne[2], g_cs->ne[3],
g_cs->nb[1],
g_cs->nb[2],
g_cs->nb[3],
ggml_row_size(g_cs->type, g_cs->ne[0] - 1));
cb(g_last, "g_last", il);
// TODO: remove this cont when CUDA supports non-cont unary ops
g_last = ggml_cont(ctx0, g_last);
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
// [1, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
cb(g_last_exp, "g_last_exp", il);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
// [CS, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cs, g_last));
cb(g_diff, "g_diff", il);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
1, chunk_size, n_chunks, g_diff_exp->ne[3]);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * g_diff_exp_t = ggml_transpose(ctx0, g_diff_exp);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// [S_k, CS, n_chunks, H_v * n_seqs]
ggml_tensor * kg = ggml_mul(ctx0, k, g_diff_exp_t);
cb(kg, "key_gdiff", il);
ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
// [CS, S_k, n_chunks, H_v * n_seqs]
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
cb(kg_t, "key_gdiff_t", il);
ggml_tensor * s_t = ggml_transpose(ctx0, s);
s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
cb(s_t, "dnet_add_ch_state", il);
// state to be updated per chunk
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * core_attn_out = nullptr;
// [CS, S_v, n_chunks, H_v * n_seqs]
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
// shape: (S_k, chunk_size, 1, H_k * n_seqs)
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
ggml_tensor * ch_k_cd = get_slice_2d(ctx0, k_cd, chunk); // [S_k, CS, 1, H_k * n_seqs]
ggml_tensor * ch_v_t = get_slice_2d(ctx0, v_t, chunk); // [ CS, S_v, 1, H_v * n_seqs]
ggml_tensor * ch_kq = get_slice_2d(ctx0, kq, chunk); // [ CS, CS, 1, H_k * n_seqs]
ggml_tensor * ch_q_g_exp = get_slice_2d(ctx0, q_g_exp, chunk); // [S_k, CS, 1, H_k * n_seqs]
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
// shape: (S_v, chunk_size, 1, H_v * n_seqs)
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
// [CS, S_v, 1, H_v * n_seqs]
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
cb(v_t_p, "v_prime", il);
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
// [CS, S_v, 1, H_v * n_seqs]
ggml_tensor * v_t_new = ggml_sub(ctx0, ch_v_t, v_t_p);
cb(v_t_new, "v_t_new", il);
// shape: (chunk_size, 1, H_v * n_seqs)
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_t_new, ch_kq);
cb(v_attn, "v_attn", il);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
// replaced by precomputed attn_kq
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
cb(attn_chunk, "attn_chunk", il);
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
cb(attn_inter, "attn_inter", il);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * o_ch = ggml_add(ctx0, attn_inter, v_attn);
cb(o_ch, "dnet_add_ch_attn_out", il);
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
// v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new_chunk", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter_chunk", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
cb(v_attn, "v_attn_chunk", il);
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
core_attn_out = core_attn_out == nullptr
? core_attn_out_chunk
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
v = ggml_set_inplace(ctx0, v, o_ch, v->nb[1], v->nb[2], v->nb[3], chunk * v->nb[2]);
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
// TODO: head broadcast might not work here - probably will need a transpose
ggml_tensor * kgv = ggml_mul_mat(ctx0, ch_kg_t, v_t_new); // [S_k, S_v, 1, H_k * n_seqs]
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
new_state = ggml_add(ctx0,
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
ggml_tensor * ch_g_last_exp = get_slice_2d(ctx0, g_last_exp, chunk);
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp);
s_t = ggml_add(ctx0, s_t, kgv);
cb(s_t, "dnet_add_ch_state", il);
}
s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
// truncate padded tokens
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
ggml_tensor * o = ggml_view_4d(ctx0, v,
S_v, n_tokens, H_v, n_seqs,
ggml_row_size(core_attn_out->type, S_v),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
output_tokens = ggml_cont(ctx0, output_tokens);
cb(output_tokens, "output_tokens", il);
ggml_row_size(v->type, S_v),
ggml_row_size(v->type, S_v * CS * n_chunks),
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
// permute back to (S_v, H_v, n_tokens, n_seqs)
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
output_tokens = ggml_cont(ctx0, output_tokens);
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {output_tokens, new_state};
return {o, s};
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive(
@@ -360,8 +324,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_aut
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * b, // beta
ggml_tensor * s, // state
int il) {
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
@@ -371,75 +335,72 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_aut
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
GGML_ASSERT(n_tokens == 1);
GGML_ASSERT(S_k == S_v);
GGML_ASSERT(H_v % H_k == 0);
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
const float eps_norm = hparams.f_norm_rms_eps;
const float scale = 1.0f / sqrtf(S_k);
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
q = ggml_scale(ctx0, q, scale);
const float scale = 1.0f / sqrtf(S_v);
q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta);
q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
cb(beta, "beta_in", il);
cb(b, "b_in", il);
cb(g, "g_in", il);
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
g = ggml_reshape_4d(ctx0, g, 1, 1, H_v, n_seqs);
b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);
ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
// [S_v, S_v, H_v, n_seqs]
g = ggml_exp(ctx0, g);
s = ggml_mul(ctx0, s, g);
// Apply exponential to g_t
g_t = ggml_exp(ctx0, g_t);
ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
// Apply the gated delta rule for the single timestep
// last_recurrent_state = last_recurrent_state * g_t
state = ggml_mul(ctx0, state, g_t);
// [1, S_v, H_v, n_seqs]
ggml_tensor * sk;
sk = ggml_mul (ctx0, s_t, k);
sk = ggml_sum_rows(ctx0, sk);
// kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
// we need to sum over dim=-2, so we transpose, sum, then transpose again
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
// [S_v, 1, H_v, n_seqs]
ggml_tensor * d;
d = ggml_sub(ctx0, v, ggml_transpose(ctx0, sk));
d = ggml_mul(ctx0, d, b);
// v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
// delta = (v_t - kv_mem) * beta_t
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
// [1, S_v, H_v, n_seqs]
ggml_tensor * d_t;
d_t = ggml_transpose(ctx0, d);
// last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
state = ggml_add(ctx0, state, k_t_delta);
// [S_v, S_v, H_v, n_seqs]
ggml_tensor * kd;
k = ggml_repeat(ctx0, k, s);
kd = ggml_mul (ctx0, k, d_t);
// Compute the attention output
// core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
// again, since it's over dim = -2, transpose, sum, transpose back
ggml_tensor * core_attn_out =
ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
s_t = ggml_add(ctx0, s_t, kd);
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il);
cb(s_t, "dnet_add_ar_state", il);
return {core_attn_out, state};
ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {o, s};
}
ggml_tensor * llm_build_qwen3next::build_norm_gated(
@@ -472,39 +433,29 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
// Split Q projection into query and gate
// The split should be along dimension 0 (the feature dimension)
ggml_tensor * Qcur = ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
cb(Qcur, "Qcur_view", il);
ggml_tensor * gate =
ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], n_embd_head * ggml_element_size(Qcur_full));
cb(Qcur, "Qcur", il);
cb(gate, "gate", il);
// Now reshape Qcur to [n_embd_head, n_head, n_tokens] for multi-head attention
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur_reshaped", il);
// Apply Q normalization
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
// Apply K normalization
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
// Reshape gate to [n_embd, n_tokens] for the sigmoid gating (flatten the heads)
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
cb(gate, "gate_reshaped", il);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// Apply RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -519,7 +470,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// Attention computation
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp,
@@ -527,10 +477,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_pregate", il);
ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
cb(gate_sigmoid, "gate_sigmoid", il);
// TODO: CUDA is missing non-contiguous unary ops. when implemented: remove this cont
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
cur = ggml_mul(ctx0, cur, gate_sigmoid);
gate = ggml_sigmoid(ctx0, gate);
cb(gate, "gate_sigmoid", il);
gate = ggml_reshape_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
cur = ggml_mul(ctx0, cur, gate);
cb(cur, "attn_gated", il);
cur = build_lora_mm(model.layers[il].wo, cur);
@@ -560,7 +515,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
cb(z, "z", il);
return { qkv_mixed, z };
} else {
// legacy (slower) path
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
@@ -624,9 +578,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) {
const auto * mctx_cur = inp->mctx;
@@ -671,7 +622,12 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il);
ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
// TODO: CUDA is missing non-contiguous unary ops. when implemented: remove this cont
b = ggml_cont(ctx0, b);
ggml_tensor * beta = ggml_sigmoid(ctx0, b);
beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
@@ -679,6 +635,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il);
@@ -686,8 +643,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
// bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
// Build the convolution states tensor
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il);
@@ -696,11 +651,12 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0];
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
cb(qkv_mixed, "qkv_mixed_transposed", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il);
@@ -720,7 +676,10 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
cb(conv_states_all, "conv_states_updated", il);
// Apply SSM convolution
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
cb(state, "state_predelta", il);
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il);
@@ -734,26 +693,36 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output
ggml_tensor * q_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
ggml_row_size(conv_qkv_mix->type, head_k_dim),
nb1_qkv,
nb1_qkv * n_seq_tokens,
0);
ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
ggml_row_size(conv_qkv_mix->type, head_k_dim),
nb1_qkv,
nb1_qkv * n_seq_tokens,
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
ggml_row_size(conv_qkv_mix->type, head_v_dim),
nb1_qkv,
nb1_qkv * n_seq_tokens,
ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));
cb(q_conv, "q_conv", il);
ggml_tensor * k_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il);
ggml_tensor * v_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il);
// Unsqueeze them
q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
const float eps_norm = hparams.f_norm_rms_eps;
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
cb(state, "state_predelta", il);
q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);
//q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
//k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
if (num_k_heads != num_v_heads) {
@@ -786,7 +755,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
}
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
@@ -795,19 +764,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
// Update the recurrent states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// Reshape both attn_out_final and z to 2D tensors for normalization
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z)
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
@@ -818,7 +783,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
return cur;
}
@@ -839,7 +805,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
if (model.layers[il].ffn_up_shexp != nullptr) {
ggml_tensor * ffn_shexp =
build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
@@ -852,11 +818,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
cb(shared_gate, "shared_expert_gate", il);
// Apply sigmoid to the gate
shared_gate = ggml_sigmoid(ctx0, shared_gate);
cb(shared_gate, "shared_expert_gate_sigmoid", il);
// Apply the gate to the shared expert output
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
cb(ffn_shexp, "ffn_shexp_gated", il);
+1
View File
@@ -20,6 +20,7 @@ add_library(mtmd
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp
models/nemotron-v2-vl.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp
+2
View File
@@ -236,6 +236,7 @@ enum projector_type {
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -270,6 +271,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
+1
View File
@@ -15,6 +15,7 @@ enum ffn_op_type {
FFN_GELU_ERF,
FFN_SILU,
FFN_GELU_QUICK,
FFN_RELU_SQR,
};
enum norm_type {
+21
View File
@@ -559,6 +559,12 @@ ggml_tensor * clip_graph::build_ffn(
cur = ggml_gelu_quick(ctx0, cur);
cb(cur, "ffn_gelu_quick", il);
} break;
case FFN_RELU_SQR:
{
cur = ggml_relu(ctx0, cur);
cur = ggml_sqr(ctx0, cur);
cb(cur, "ffn_relu_sqr", il);
} break;
}
if (down) {
@@ -810,6 +816,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_internvl>(ctx, img);
} break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
builder = std::make_unique<clip_graph_nemotron_v2_vl>(ctx, img);
} break;
case PROJECTOR_TYPE_LLAMA4:
{
builder = std::make_unique<clip_graph_llama4>(ctx, img);
@@ -1110,6 +1120,7 @@ struct clip_model_loader {
}
} break;
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
@@ -1767,6 +1778,12 @@ struct clip_model_loader {
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
} break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
} break;
case PROJECTOR_TYPE_GLMA:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -3088,6 +3105,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
case PROJECTOR_TYPE_GLM_EDGE:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
clip_image_u8 resized_image;
int sz = params.image_size;
@@ -3397,6 +3415,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
case PROJECTOR_TYPE_LLAMA4:
{
// both X and Y are downscaled by the scale factor
@@ -3805,6 +3824,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_ULTRAVOX:
@@ -3968,6 +3988,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
return ctx->model.mm_3_w->ne[1];
case PROJECTOR_TYPE_LLAMA4:
return ctx->model.mm_model_proj->ne[1];
+5
View File
@@ -42,6 +42,11 @@ struct clip_graph_internvl : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_nemotron_v2_vl : clip_graph {
clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llama4 : clip_graph {
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
+35
View File
@@ -0,0 +1,35 @@
#include "models.h"
ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_registers = model.class_embedding->ne[1];
const int n_pos = n_patches + n_registers;
ggml_tensor * inp = build_inp();
// add position embeddings (pre-downsampled during GGUF conversion for fixed 512x512 input)
inp = ggml_add(ctx0, inp, model.position_embeddings);
cb(inp, "inp_pos", -1);
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, nullptr);
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd),
n_registers * ggml_row_size(cur->type, n_embd));
cur = build_patch_merge_permute(cur, model.hparams.n_merge);
{
cur = build_norm(cur, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cur = build_ffn(cur, model.mm_1_w, nullptr, nullptr, nullptr, model.mm_3_w, nullptr, FFN_RELU_SQR, -1);
}
ggml_build_forward_expand(gf, cur);
return gf;
}