Compare commits

...

2 Commits

Author SHA1 Message Date
Jared Van Bortel 15499eb942 mpt : do not duplicate token_embd.weight on disk (#5670) 2024-02-22 17:05:23 -05:00
Georgi Gerganov 96633eeca1 gemma : use more bits for the token_embd.weight tensor (#5650)
* gemma : use Q8_0 for the token_embd.weight tensor

* llama : quantize token_embd.weight using output type
2024-02-22 23:23:46 +02:00
2 changed files with 8 additions and 8 deletions
-5
View File
@@ -622,11 +622,6 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data)
# note: MPT output is tied to (same as) wte in original model;
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
if new_name == "token_embd.weight":
self.gguf_writer.add_tensor("output.weight", data)
class OrionModel(Model):
def set_vocab(self):
+8 -3
View File
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -4056,7 +4055,10 @@ static bool llm_load_tensors(
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
// same as tok_embd, duplicated to allow offloading
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
ml.n_created--; // artificial tensor
ml.size_data += ggml_nbytes(model.output);
}
for (int i = 0; i < n_layer; ++i) {
@@ -10498,7 +10500,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
return std::make_pair(i_layer, n_layer);
};
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
(LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;