mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
model : support for DeepseekV32ForCausalLM with generic DeepSeek Sparse Attention (DSA) implementation (#23346)
* llama : support DeepSeek V3.2 model family (with DSA lightning indexer) * convert : handle DeepseekV32ForCausalLM architecture * ggml : support for f16 GGML_OP_FILL * memory : separate hparams argument in llama_kv_cache constructor * memory : add llama_kv_cache_dsa memory (KV cache + lightning indexer cache) * llama : support for LLM_ARCH_DEEPSEEK32 * model : llama_model_deepseek32 implementation * model : merge two scale operations into one in DSA lightning indexer implementation * chore : remove unused code * model : support NVFP4 in DeepSeek V3.2 Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * memory : refactoring TODO Co-authored-by: ggerganov <ggerganov@users.noreply.github.com> --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: ggerganov <ggerganov@users.noreply.github.com>
This commit is contained in:
@@ -47,6 +47,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"DeepseekForCausalLM": "deepseek",
|
||||
"DeepseekV2ForCausalLM": "deepseek",
|
||||
"DeepseekV3ForCausalLM": "deepseek",
|
||||
"DeepseekV32ForCausalLM": "deepseek",
|
||||
"DistilBertForMaskedLM": "bert",
|
||||
"DistilBertForSequenceClassification": "bert",
|
||||
"DistilBertModel": "bert",
|
||||
|
||||
@@ -915,6 +915,8 @@ class ModelBase:
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D_Q,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D_K,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D_V,
|
||||
# DSA indexer weights should be F32
|
||||
gguf.MODEL_TENSOR.INDEXER_PROJ,
|
||||
)
|
||||
)
|
||||
or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
|
||||
|
||||
@@ -386,3 +386,32 @@ class DeepseekV2Model(TextModel):
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekV32ForCausalLM")
|
||||
class DeepseekV32Model(DeepseekV2Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK32
|
||||
skip_mtp = False
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def set_vocab(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
assert getattr(tokenizer, "add_bos_token", False), "Change value of add_bos_token to true in tokenizer_config.json file."
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# NextN/MTP prediction layers
|
||||
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
|
||||
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
|
||||
|
||||
# DSA indexer parameters
|
||||
self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
|
||||
self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
|
||||
self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
|
||||
|
||||
Reference in New Issue
Block a user