mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
model : support Gemma4_26B_A4B_NVFP4 (#22804)
* Gemma4_26B_A4B_NvFp4 hf checkpoint convert to gguf format fixes

  Signed-off-by: ynankani <ynankani@nvidia.com>

* Apply suggestions from code review

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Address review comments

  Signed-off-by: ynankani <ynankani@nvidia.com>

* fix CRLF

  Signed-off-by: ynankani <ynankani@nvidia.com>

* Lint error fix

  Signed-off-by: ynankani <ynankani@nvidia.com>

---------

Signed-off-by: ynankani <ynankani@nvidia.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
+25
-1
@@ -7988,13 +7988,37 @@ class Gemma4Model(Gemma3Model):
|
||||
rope_freqs_full = torch.tensor(values, dtype=torch.float32)
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
|
||||
|
||||
def _generate_nvfp4_tensors(self):
    """Fold the per-expert router scale into the NVFP4 down_proj scales.

    Gemma-4 keeps a per-layer ``router.per_expert_scale`` tensor ([n_expert])
    that scales each expert's contribution. That is mathematically the same
    as a per-expert scalar on the down_proj output, which is exactly where
    ffn_down_exps_s is applied at inference. Each expert's NVFP4
    ``down_proj.weight_scale_2`` is therefore multiplied by its router entry,
    the router tensor is removed from ``self.model_tensors``, and the parent
    class's NVFP4 generation then runs on the adjusted scales.

    Layers in which no expert carries a ``weight_scale_2`` tensor are left
    untouched; a layer in which only some experts carry one trips an assert.
    """
    expert_count = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
    router_suffix = ".router.per_expert_scale"

    def folded(scale_gen, router_gen, expert_idx):
        # Bind the generators now, but only materialize the tensors on call.
        return lambda: scale_gen() * router_gen()[expert_idx]

    # Snapshot the matching names up front: entries are popped inside the loop.
    router_names = [key for key in self.model_tensors if key.endswith(router_suffix)]

    for router_name in router_names:
        layer_match = re.search(r"\.layers\.(\d+)\.", router_name)
        if layer_match is None:
            continue

        bid = layer_match.group(1)
        # Everything up to and including ".layers.<bid>.".
        layer_prefix = router_name[: layer_match.end()]

        scale_names = []
        for expert_idx in range(expert_count):
            scale_names.append(f"{layer_prefix}experts.{expert_idx}.down_proj.weight_scale_2")

        found = sum(1 for scale_name in scale_names if scale_name in self.model_tensors)
        if found == 0:
            # Layer is not NVFP4-quantized (or there are no experts): nothing to fold.
            continue
        assert found == len(scale_names), f"layer {bid}: partial NVFP4 quantization across experts"

        router_gen = self.model_tensors.pop(router_name)
        for expert_idx, scale_name in enumerate(scale_names):
            self.model_tensors[scale_name] = folded(self.model_tensors[scale_name], router_gen, expert_idx)

    super()._generate_nvfp4_tensors()
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
|
||||
if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
|
||||
name = name + ".weight"
|
||||
if ".experts." in name and not name.endswith(".weight"):
|
||||
if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
|
||||
name += ".weight"
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
@@ -2443,6 +2443,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
|
||||
+11
-5
@@ -110,7 +110,13 @@ void llama_model_gemma4::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
|
||||
|
||||
// MoE FFN
|
||||
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
|
||||
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (layer.ffn_gate_up_exps == nullptr) {
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
}
|
||||
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
|
||||
// per-expert scale will be loaded as down_exps_s at the end of the current switch case
|
||||
@@ -286,8 +292,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
||||
|
||||
cur_moe = build_moe_ffn(cur_moe,
|
||||
nullptr, // gate_inp
|
||||
nullptr, // up_exps
|
||||
nullptr, // gate_exps
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr, // exp_probs_b (not used for gemma4)
|
||||
n_expert, n_expert_used,
|
||||
@@ -296,8 +302,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il, logits,
|
||||
model.layers[il].ffn_gate_up_exps,
|
||||
nullptr, // up_exps_s
|
||||
nullptr, // gate_exps_s
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cur_moe = build_norm(cur_moe,
|
||||
model.layers[il].ffn_post_norm_2, nullptr,
|
||||
|
||||
Reference in New Issue
Block a user