mpt : do not duplicate token_embd.weight on disk (#5670)

gemma : use more bits for the token_embd.weight tensor (#5650)
* gemma : use Q8_0 for the token_embd.weight tensor
* llama : quantize token_embd.weight using output type
2026-06-18 03:37:39 +02:00 · 2024-02-22 17:05:23 -05:00 · 2024-02-22 23:23:46 +02:00
2 changed files with 8 additions and 8 deletions
@@ -622,11 +622,6 @@ class MPTModel(Model):

            self.gguf_writer.add_tensor(new_name, data)

-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-

 class OrionModel(Model):
    def set_vocab(self):
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -4056,7 +4055,10 @@ static bool llm_load_tensors(
                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);

-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                    }

                    for (int i = 0; i < n_layer; ++i) {
@@ -10498,7 +10500,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        return std::make_pair(i_layer, n_layer);
    };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
        int nx = tensor->ne[0];
        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
Author	SHA1	Message	Date
Jared Van Bortel	15499eb942	mpt : do not duplicate token_embd.weight on disk (#5670)	2024-02-22 17:05:23 -05:00
Georgi Gerganov	96633eeca1	gemma : use more bits for the token_embd.weight tensor (#5650) * gemma : use Q8_0 for the token_embd.weight tensor * llama : quantize token_embd.weight using output type	2024-02-22 23:23:46 +02:00