model : support Gemma4_26B_A4B_NVFP4 (#22804)

* Gemma4_26B_A4B_NvFp4 hf checkpoint convert to gguf format fixes

Signed-off-by: ynankani <ynankani@nvidia.com>

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Address review comments

Signed-off-by: ynankani <ynankani@nvidia.com>

* fix CRLF

Signed-off-by: ynankani <ynankani@nvidia.com>

* Lint error fix

Signed-off-by: ynankani <ynankani@nvidia.com>

---------

Signed-off-by: ynankani <ynankani@nvidia.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
ynankani
2026-05-08 18:42:09 +00:00
committed by GitHub
parent f9cd456ea5
commit 9f5f0e689c
3 changed files with 38 additions and 6 deletions
+25 -1
View File
@@ -7988,13 +7988,37 @@ class Gemma4Model(Gemma3Model):
rope_freqs_full = torch.tensor(values, dtype=torch.float32)
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
def _generate_nvfp4_tensors(self):
# Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales
# each expert's contribution. It's mathematically equivalent to a per-expert
# scalar on the down_proj output, which is exactly where ffn_down_exps_s is
# applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the
# existing NVFP4 path produces the right scales.
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]:
bid_match = re.search(r"\.layers\.(\d+)\.", name)
if bid_match is None:
continue
bid = bid_match.group(1)
prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")]
w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)]
present = [w2 in self.model_tensors for w2 in w2_targets]
if not any(present):
continue
assert all(present), f"layer {bid}: partial NVFP4 quantization across experts"
r = self.model_tensors.pop(name)
for e, w2 in enumerate(w2_targets):
s = self.model_tensors[w2]
self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i]
super()._generate_nvfp4_tensors()
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
name = name + ".weight"
if ".experts." in name and not name.endswith(".weight"):
if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
name += ".weight"
return super().filter_tensors((name, gen))
+2
View File
@@ -2443,6 +2443,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
+11 -5
View File
@@ -110,7 +110,13 @@ void llama_model_gemma4::load_arch_tensors(llama_model_loader &) {
layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
// MoE FFN
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, TENSOR_NOT_REQUIRED);
if (layer.ffn_gate_up_exps == nullptr) {
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
}
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
// per-expert scale will be loaded as down_exps_s at the end of the current switch case
@@ -286,8 +292,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
cur_moe = build_moe_ffn(cur_moe,
nullptr, // gate_inp
nullptr, // up_exps
nullptr, // gate_exps
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr, // exp_probs_b (not used for gemma4)
n_expert, n_expert_used,
@@ -296,8 +302,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il, logits,
model.layers[il].ffn_gate_up_exps,
nullptr, // up_exps_s
nullptr, // gate_exps_s
model.layers[il].ffn_up_exps_s,
model.layers[il].ffn_gate_exps_s,
model.layers[il].ffn_down_exps_s);
cur_moe = build_norm(cur_moe,
model.layers[il].ffn_post_norm_2, nullptr,