llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478 )

* common : don't crash if newline token is not found * common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs
llama : make load error reporting more granular (#5477 )
2026-06-16 10:46:43 +02:00 · 2024-02-13 18:18:16 +02:00 · 2024-02-13 15:24:50 +02:00 · 2024-02-13 15:15:42 +02:00 · 2024-02-13 15:14:22 +02:00 · 2024-02-13 14:06:58 +02:00
14 changed files with 525 additions and 313 deletions
@@ -569,6 +569,14 @@ $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
 endif # LLAMA_CUBLAS
 $(info )

@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run open_llama_3b_v2
@@ -1648,6 +1648,7 @@ class BertModel(Model):
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_causal_attention(False)
+        self.gguf_writer.add_pooling_layer(True)
        self.gguf_writer.add_file_type(self.ftype)

    def set_vocab(self):
@@ -7,6 +7,51 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static std::vector<std::string> split_lines(const std::string & s) {
+    std::string line;
+    std::vector<std::string> lines;
+    std::stringstream ss(s);
+    while (std::getline(ss, line)) {
+        lines.push_back(line);
+    }
+    return lines;
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+    for (size_t i = 0; i < tokens.size(); i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+    }
+}
+
+static void normalize(float * vec, float * out, int n) {
+    float norm = 0;
+    for (int i = 0; i < n; i++) {
+        norm += vec[i] * vec[i];
+    }
+    norm = sqrt(norm);
+    for (int i = 0; i < n; i++) {
+        out[i] = vec[i] / norm;
+    }
+}
+
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+    // clear previous kv_cache values (irrelevant for embeddings)
+    llama_kv_cache_clear(ctx);
+
+    // run model
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_decode(ctx, batch) < 0) {
+        fprintf(stderr, "%s : failed to decode\n", __func__);
+    }
+
+    // normalize on copy
+    for (int k = 0; k < n_seq; k++) {
+        float * emb = llama_get_embeddings_ith(ctx, k);
+        float * out = output + k * n_embd;
+        normalize(emb, out, n_embd);
+    }
+}
+
 int main(int argc, char ** argv) {
    gpt_params params;

@@ -55,59 +100,84 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

-    int n_past = 0;
+    // split the prompt into lines
+    std::vector<std::string> prompts = split_lines(params.prompt);

-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    // max batch size
+    const uint64_t n_batch = params.n_batch;
+    GGML_ASSERT(params.n_batch == params.n_ctx);

+    // tokenize the prompts and trim
+    std::vector<std::vector<int32_t>> inputs;
+    for (const auto & prompt : prompts) {
+        auto inp = ::llama_tokenize(ctx, prompt, true);
+        if (inp.size() > n_batch) {
+            inp.resize(n_batch);
+        }
+        inputs.push_back(inp);
+    }
+
+    // tokenization stats
    if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+        for (int i = 0; i < (int) inputs.size(); i++) {
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            for (int j = 0; j < (int) inputs[i].size(); j++) {
+                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+            }
+            fprintf(stderr, "\n\n");
        }
-        fprintf(stderr, "\n");
    }

-    if (embd_inp.size() > (size_t)n_ctx) {
-        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
-                __func__, embd_inp.size(), n_ctx);
-        return 1;
-    }
-
-    while (!embd_inp.empty()) {
-        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
-        if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-        n_past += n_tokens;
-        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
-    }
+    // initialize batch
+    const int n_prompts = prompts.size();
+    struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);

+    // allocate output
    const int n_embd = llama_n_embd(model);
-    auto * embeddings = llama_get_embeddings(ctx);
+    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    float * emb = embeddings.data();

-    // l2-normalize embeddings
-    float norm = 0;
-    for (int i = 0; i < n_embd; i++) {
-        norm += embeddings[i] * embeddings[i];
-    }
-    norm = sqrt(norm);
-    for (int i = 0; i < n_embd; i++) {
-        embeddings[i] /= norm;
+    // break into batches
+    int p = 0; // number of prompts processed already
+    int s = 0; // number of prompts in current batch
+    for (int k = 0; k < n_prompts; k++) {
+        // clamp to n_batch tokens
+        auto & inp = inputs[k];
+        const uint64_t n_toks = inp.size();
+
+        // encode if at capacity
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + p * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd);
+            llama_batch_clear(batch);
+            p += s;
+            s = 0;
+        }
+
+        // add to batch
+        batch_add_seq(batch, inp, s);
+        s += 1;
    }

-    for (int i = 0; i < n_embd; i++) {
-        printf("%f ", embeddings[i]);
-    }
-    printf("\n");
+    // final batch
+    float * out = emb + p * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd);

+    // print first 3 embeddings
+    for (int j = 0; j < std::min(3, n_prompts); j++) {
+        fprintf(stderr, "embedding %d: ", j);
+        for (int i = 0; i < n_embd; i++) {
+            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+        }
+        fprintf(stderr, "\n\n");
+    }
+    fprintf(stderr, "\n");
+
+    // clean up
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
-
    llama_backend_free();

    return 0;
@@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
  --rank-wk N                LORA rank for wk tensor (default 4)
  --rank-wv N                LORA rank for wv tensor (default 4)
  --rank-wo N                LORA rank for wo tensor (default 4)
-  --rank-w1 N                LORA rank for w1 tensor (default 4)
-  --rank-w2 N                LORA rank for w2 tensor (default 4)
-  --rank-w3 N                LORA rank for w3 tensor (default 4)
+  --rank-ffn_gate N          LORA rank for ffn_gate tensor (default 4)
+  --rank-ffn_down N          LORA rank for ffn_down tensor (default 4)
+  --rank-ffn_up N            LORA rank for ffn_up tensor (default 4)
 ```

 The LORA rank of 'norm' tensors should always be 1.
@@ -60,9 +60,9 @@ struct my_llama_layer {
    struct ggml_tensor * ffn_norm;

    // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
+    struct ggml_tensor * ffn_gate; // w1
+    struct ggml_tensor * ffn_down; // w2
+    struct ggml_tensor * ffn_up;   // w3
 };

 struct my_llama_model {
@@ -85,9 +85,9 @@ struct my_llama_lora_hparams {
    uint32_t n_rank_wv = 4;
    uint32_t n_rank_wo = 4;
    uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_w1 = 4;
-    uint32_t n_rank_w2 = 4;
-    uint32_t n_rank_w3 = 4;
+    uint32_t n_rank_ffn_gate = 4;
+    uint32_t n_rank_ffn_down = 4;
+    uint32_t n_rank_ffn_up = 4;
    uint32_t n_rank_tok_embeddings = 4;
    uint32_t n_rank_norm = 1;
    uint32_t n_rank_output = 4;
@@ -117,12 +117,12 @@ struct my_llama_lora_layer {
    struct ggml_tensor * ffn_norm_b;

    // ff
-    struct ggml_tensor * w1_a;
-    struct ggml_tensor * w1_b;
-    struct ggml_tensor * w2_a;
-    struct ggml_tensor * w2_b;
-    struct ggml_tensor * w3_a;
-    struct ggml_tensor * w3_b;
+    struct ggml_tensor * ffn_gate_a;
+    struct ggml_tensor * ffn_gate_b;
+    struct ggml_tensor * ffn_down_a;
+    struct ggml_tensor * ffn_down_b;
+    struct ggml_tensor * ffn_up_a;
+    struct ggml_tensor * ffn_up_b;
 };

 struct my_llama_lora {
@@ -208,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
    printf("%s: n_rank_wv             : %u\n", __func__, params->n_rank_wv);
    printf("%s: n_rank_wo             : %u\n", __func__, params->n_rank_wo);
    printf("%s: n_rank_ffn_norm       : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_w1             : %u\n", __func__, params->n_rank_w1);
-    printf("%s: n_rank_w2             : %u\n", __func__, params->n_rank_w2);
-    printf("%s: n_rank_w3             : %u\n", __func__, params->n_rank_w3);
+    printf("%s: n_rank_ffn_gate       : %u\n", __func__, params->n_rank_ffn_gate);
+    printf("%s: n_rank_ffn_down       : %u\n", __func__, params->n_rank_ffn_down);
+    printf("%s: n_rank_ffn_up         : %u\n", __func__, params->n_rank_ffn_up);
    printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
    printf("%s: n_rank_norm           : %u\n", __func__, params->n_rank_norm);
    printf("%s: n_rank_output         : %u\n", __func__, params->n_rank_output);
@@ -319,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
        layer.wv             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
        layer.wo             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
        layer.ffn_norm       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.w1             = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.w2             = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.w3             = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
+        layer.ffn_gate       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
+        layer.ffn_down       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
+        layer.ffn_up         = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));

        assert_shape_1d(layer.attention_norm, hparams.n_embd);
        assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
@@ -329,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
        assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
        assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
-        assert_shape_2d(layer.w1,             hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.w2,             hparams.n_ff,   hparams.n_embd);
-        assert_shape_2d(layer.w3,             hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_gate,       hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_down,       hparams.n_ff,   hparams.n_embd);
+        assert_shape_2d(layer.ffn_up,         hparams.n_embd, hparams.n_ff);
    }
 }

@@ -362,12 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
        ggml_set_param(ctx, layer.wo_b);
        ggml_set_param(ctx, layer.ffn_norm_a);
        ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.w1_a);
-        ggml_set_param(ctx, layer.w1_b);
-        ggml_set_param(ctx, layer.w2_a);
-        ggml_set_param(ctx, layer.w2_b);
-        ggml_set_param(ctx, layer.w3_a);
-        ggml_set_param(ctx, layer.w3_b);
+        ggml_set_param(ctx, layer.ffn_gate_a);
+        ggml_set_param(ctx, layer.ffn_gate_b);
+        ggml_set_param(ctx, layer.ffn_down_a);
+        ggml_set_param(ctx, layer.ffn_down_b);
+        ggml_set_param(ctx, layer.ffn_up_a);
+        ggml_set_param(ctx, layer.ffn_up_b);
    }
 }

@@ -435,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
        layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
        layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);

-        layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
-        layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
-        layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
-        layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
-        layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
-        layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
+        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
+        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
+        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
+        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
+        layer.ffn_up_a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_embd);
+        layer.ffn_up_b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_ff);

        ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
        ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
@@ -454,12 +454,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
        ggml_set_name(layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_b", i));
        ggml_set_name(layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_a", i));
        ggml_set_name(layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_b", i));
-        ggml_set_name(layer.w1_a,             tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
-        ggml_set_name(layer.w1_b,             tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
-        ggml_set_name(layer.w2_a,             tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
-        ggml_set_name(layer.w2_b,             tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
-        ggml_set_name(layer.w3_a,             tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
-        ggml_set_name(layer.w3_b,             tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
    }

    set_param_lora(lora);
@@ -497,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
        randomize_tensor_normal(layer.ffn_norm_a, rnd);
        ggml_set_zero(layer.ffn_norm_b);

-        randomize_tensor_normal(layer.w1_a, rnd);
-        ggml_set_zero(layer.w1_b);
-        randomize_tensor_normal(layer.w2_a, rnd);
-        ggml_set_zero(layer.w2_b);
-        randomize_tensor_normal(layer.w3_a, rnd);
-        ggml_set_zero(layer.w3_b);
+        randomize_tensor_normal(layer.ffn_gate_a, rnd);
+        ggml_set_zero(layer.ffn_gate_b);
+        randomize_tensor_normal(layer.ffn_down_a, rnd);
+        ggml_set_zero(layer.ffn_down_b);
+        randomize_tensor_normal(layer.ffn_up_a, rnd);
+        ggml_set_zero(layer.ffn_up_b);
    }

    free_random_normal_distribution(rnd);
@@ -610,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

        struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
        struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
-        struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
-        struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
-        struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
-        struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
-        struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
-        struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
+        struct ggml_tensor * wq       = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
+        struct ggml_tensor * wk       = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
+        struct ggml_tensor * wv       = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
+        struct ggml_tensor * wo       = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
+        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
+        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
+        struct ggml_tensor * ffn_up   = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));

        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                       set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
        struct ggml_tensor * t03 = ggml_repeat       (ctx, attention_norm, t02);                     set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
@@ -659,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, rms_norm_eps);                       set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
        struct ggml_tensor * t23 = ggml_repeat       (ctx, ffn_norm, t22);                           set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                                set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, w3, t24);                                 set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, w1, t24);                                 set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
+        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, ffn_up, t24);                             set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
+        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, ffn_gate, t24);                           set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                     set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                                set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, w2, t28);                                 set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
+        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, ffn_down, t28);                           set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                                set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
        cur = t30;
        if (enable_checkpointing) {
@@ -723,9 +723,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
    }

    // allocating checkpoints in one block to reduce memory fragmentation
@@ -798,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
+    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
+    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
+    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);

    init_lora(model, lora);

@@ -825,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
        copy_tensor_by_name(layer.wo_b,             f_ggml_ctx, ggml_get_name(layer.wo_b));
        copy_tensor_by_name(layer.ffn_norm_a,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
        copy_tensor_by_name(layer.ffn_norm_b,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
-        copy_tensor_by_name(layer.w1_a,             f_ggml_ctx, ggml_get_name(layer.w1_a));
-        copy_tensor_by_name(layer.w1_b,             f_ggml_ctx, ggml_get_name(layer.w1_b));
-        copy_tensor_by_name(layer.w2_a,             f_ggml_ctx, ggml_get_name(layer.w2_a));
-        copy_tensor_by_name(layer.w2_b,             f_ggml_ctx, ggml_get_name(layer.w2_b));
-        copy_tensor_by_name(layer.w3_a,             f_ggml_ctx, ggml_get_name(layer.w3_a));
-        copy_tensor_by_name(layer.w3_b,             f_ggml_ctx, ggml_get_name(layer.w3_b));
+        copy_tensor_by_name(layer.ffn_gate_a,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
+        copy_tensor_by_name(layer.ffn_gate_b,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
+        copy_tensor_by_name(layer.ffn_down_a,       f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
+        copy_tensor_by_name(layer.ffn_down_b,       f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
+        copy_tensor_by_name(layer.ffn_up_a,         f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
+        copy_tensor_by_name(layer.ffn_up_b,         f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
    }
 }

@@ -868,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V,       lora->hparams.n_rank_wv);
    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,     lora->hparams.n_rank_wo);
    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM,     lora->hparams.n_rank_ffn_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_w1);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_w2);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_w3);
+    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_ffn_gate);
+    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_ffn_down);
+    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_ffn_up);

    gguf_add_tensor(fctx, lora->tok_embeddings_a);
    gguf_add_tensor(fctx, lora->tok_embeddings_b);
@@ -894,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
        gguf_add_tensor(fctx, layer.wo_b);
        gguf_add_tensor(fctx, layer.ffn_norm_a);
        gguf_add_tensor(fctx, layer.ffn_norm_b);
-        gguf_add_tensor(fctx, layer.w1_a);
-        gguf_add_tensor(fctx, layer.w1_b);
-        gguf_add_tensor(fctx, layer.w2_a);
-        gguf_add_tensor(fctx, layer.w2_b);
-        gguf_add_tensor(fctx, layer.w3_a);
-        gguf_add_tensor(fctx, layer.w3_b);
+        gguf_add_tensor(fctx, layer.ffn_gate_a);
+        gguf_add_tensor(fctx, layer.ffn_gate_b);
+        gguf_add_tensor(fctx, layer.ffn_down_a);
+        gguf_add_tensor(fctx, layer.ffn_down_b);
+        gguf_add_tensor(fctx, layer.ffn_up_a);
+        gguf_add_tensor(fctx, layer.ffn_up_b);
    }
 }

@@ -1104,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
        write_tensor(&file, layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraB"));
        write_tensor(&file, layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraA"));
        write_tensor(&file, layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraB"));
-        write_tensor(&file, layer.w1_a,             tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
-        write_tensor(&file, layer.w1_b,             tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
-        write_tensor(&file, layer.w2_a,             tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
-        write_tensor(&file, layer.w2_b,             tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
-        write_tensor(&file, layer.w3_a,             tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
-        write_tensor(&file, layer.w3_b,             tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
+        write_tensor(&file, layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
+        write_tensor(&file, layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
+        write_tensor(&file, layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
+        write_tensor(&file, layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
+        write_tensor(&file, layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
+        write_tensor(&file, layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
    }
 }

@@ -1139,9 +1139,9 @@ struct train_params {
    uint32_t n_rank_wv;
    uint32_t n_rank_wo;
    uint32_t n_rank_ffn_norm;
-    uint32_t n_rank_w1;
-    uint32_t n_rank_w2;
-    uint32_t n_rank_w3;
+    uint32_t n_rank_ffn_gate;
+    uint32_t n_rank_ffn_down;
+    uint32_t n_rank_ffn_up;
    uint32_t n_rank_tok_embeddings;
    uint32_t n_rank_norm;
    uint32_t n_rank_output;
@@ -1152,9 +1152,9 @@ struct train_params {
    bool custom_n_rank_wv;
    bool custom_n_rank_wo;
    bool custom_n_rank_ffn_norm;
-    bool custom_n_rank_w1;
-    bool custom_n_rank_w2;
-    bool custom_n_rank_w3;
+    bool custom_n_rank_ffn_gate;
+    bool custom_n_rank_ffn_down;
+    bool custom_n_rank_ffn_up;
    bool custom_n_rank_tok_embeddings;
    bool custom_n_rank_norm;
    bool custom_n_rank_output;
@@ -1186,9 +1186,9 @@ static struct train_params get_default_train_params() {
    params.n_rank_wv             = 4;
    params.n_rank_wo             = 4;
    params.n_rank_ffn_norm       = 1;
-    params.n_rank_w1             = 4;
-    params.n_rank_w2             = 4;
-    params.n_rank_w3             = 4;
+    params.n_rank_ffn_gate       = 4;
+    params.n_rank_ffn_down       = 4;
+    params.n_rank_ffn_up         = 4;
    params.n_rank_tok_embeddings = 4;
    params.n_rank_norm           = 1;
    params.n_rank_output         = 4;
@@ -1199,9 +1199,9 @@ static struct train_params get_default_train_params() {
    params.custom_n_rank_wv             = false;
    params.custom_n_rank_wo             = false;
    params.custom_n_rank_ffn_norm       = false;
-    params.custom_n_rank_w1             = false;
-    params.custom_n_rank_w2             = false;
-    params.custom_n_rank_w3             = false;
+    params.custom_n_rank_ffn_gate       = false;
+    params.custom_n_rank_ffn_down       = false;
+    params.custom_n_rank_ffn_up         = false;
    params.custom_n_rank_tok_embeddings = false;
    params.custom_n_rank_norm           = false;
    params.custom_n_rank_output         = false;
@@ -1232,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
    fprintf(stderr, "  --rank-wk N                LORA rank for wk tensor, overrides default rank.\n");
    fprintf(stderr, "  --rank-wv N                LORA rank for wv tensor, overrides default rank.\n");
    fprintf(stderr, "  --rank-wo N                LORA rank for wo tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-w1 N                LORA rank for w1 tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-w2 N                LORA rank for w2 tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-w3 N                LORA rank for w3 tensor, overrides default rank.\n");
+    fprintf(stderr, "  --rank-ffn_gate N          LORA rank for ffn_gate tensor, overrides default rank.\n");
+    fprintf(stderr, "  --rank-ffn_down N          LORA rank for ffn_down tensor, overrides default rank.\n");
+    fprintf(stderr, "  --rank-ffn_up N            LORA rank for ffn_up tensor, overrides default rank.\n");

    print_common_train_usage(argc, argv, &params->common);
 }
@@ -1369,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
            }
            params->n_rank_wo = std::stoi(argv[i]);
            params->custom_n_rank_wo = true;
-        } else if (arg == "--rank-w1") {
+        } else if (arg == "--rank-ffn_gate") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params->n_rank_w1 = std::stoi(argv[i]);
-            params->custom_n_rank_w1 = true;
-        } else if (arg == "--rank-w2") {
+            params->n_rank_ffn_gate = std::stoi(argv[i]);
+            params->custom_n_rank_ffn_gate = true;
+        } else if (arg == "--rank-ffn_down") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params->n_rank_w2 = std::stoi(argv[i]);
-            params->custom_n_rank_w2 = true;
-        } else if (arg == "--rank-w3") {
+            params->n_rank_ffn_down = std::stoi(argv[i]);
+            params->custom_n_rank_ffn_down = true;
+        } else if (arg == "--rank-ffn_up") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params->n_rank_w3 = std::stoi(argv[i]);
-            params->custom_n_rank_w3 = true;
+            params->n_rank_ffn_up = std::stoi(argv[i]);
+            params->custom_n_rank_ffn_up = true;
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            train_print_usage(argc, argv, &default_params);
@@ -1452,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
        nx += ggml_nelements(layer.wo_b);
        nx += ggml_nelements(layer.ffn_norm_a);
        nx += ggml_nelements(layer.ffn_norm_b);
-        nx += ggml_nelements(layer.w1_a);
-        nx += ggml_nelements(layer.w1_b);
-        nx += ggml_nelements(layer.w2_a);
-        nx += ggml_nelements(layer.w2_b);
-        nx += ggml_nelements(layer.w3_a);
-        nx += ggml_nelements(layer.w3_b);
+        nx += ggml_nelements(layer.ffn_gate_a);
+        nx += ggml_nelements(layer.ffn_gate_b);
+        nx += ggml_nelements(layer.ffn_down_a);
+        nx += ggml_nelements(layer.ffn_down_b);
+        nx += ggml_nelements(layer.ffn_up_a);
+        nx += ggml_nelements(layer.ffn_up_b);
    }
    return nx;
 }
@@ -1511,9 +1511,9 @@ int main(int argc, char ** argv) {
    uint32_t n_rank_wv                 = params.custom_n_rank_wv             ? params.n_rank_wv             : params.lora_r;
    uint32_t n_rank_wo                 = params.custom_n_rank_wo             ? params.n_rank_wo             : params.lora_r;
    uint32_t n_rank_ffn_norm           = params.custom_n_rank_ffn_norm       ? params.n_rank_ffn_norm       : 1;
-    uint32_t n_rank_w1                 = params.custom_n_rank_w1             ? params.n_rank_w1             : params.lora_r;
-    uint32_t n_rank_w2                 = params.custom_n_rank_w2             ? params.n_rank_w2             : params.lora_r;
-    uint32_t n_rank_w3                 = params.custom_n_rank_w3             ? params.n_rank_w3             : params.lora_r;
+    uint32_t n_rank_ffn_gate           = params.custom_n_rank_ffn_gate       ? params.n_rank_ffn_gate       : params.lora_r;
+    uint32_t n_rank_ffn_down           = params.custom_n_rank_ffn_down       ? params.n_rank_ffn_down       : params.lora_r;
+    uint32_t n_rank_ffn_up             = params.custom_n_rank_ffn_up         ? params.n_rank_ffn_up         : params.lora_r;
    uint32_t n_rank_tok_embeddings     = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
    uint32_t n_rank_norm               = params.custom_n_rank_norm           ? params.n_rank_norm           : 1;
    uint32_t n_rank_output             = params.custom_n_rank_output         ? params.n_rank_output         : params.lora_r;
@@ -1523,9 +1523,9 @@ int main(int argc, char ** argv) {
    lora.hparams.n_rank_wv             = n_rank_wv;
    lora.hparams.n_rank_wo             = n_rank_wo;
    lora.hparams.n_rank_ffn_norm       = n_rank_ffn_norm;
-    lora.hparams.n_rank_w1             = n_rank_w1;
-    lora.hparams.n_rank_w2             = n_rank_w2;
-    lora.hparams.n_rank_w3             = n_rank_w3;
+    lora.hparams.n_rank_ffn_gate       = n_rank_ffn_gate;
+    lora.hparams.n_rank_ffn_down       = n_rank_ffn_down;
+    lora.hparams.n_rank_ffn_up         = n_rank_ffn_up;
    lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
    lora.hparams.n_rank_norm           = n_rank_norm;
    lora.hparams.n_rank_output         = n_rank_output;
@@ -1566,9 +1566,9 @@ int main(int argc, char ** argv) {
        || (lora.hparams.n_rank_wv             != n_rank_wv)
        || (lora.hparams.n_rank_wo             != n_rank_wo)
        || (lora.hparams.n_rank_ffn_norm       != n_rank_ffn_norm)
-        || (lora.hparams.n_rank_w1             != n_rank_w1)
-        || (lora.hparams.n_rank_w2             != n_rank_w2)
-        || (lora.hparams.n_rank_w3             != n_rank_w3)
+        || (lora.hparams.n_rank_ffn_gate       != n_rank_ffn_gate)
+        || (lora.hparams.n_rank_ffn_down       != n_rank_ffn_down)
+        || (lora.hparams.n_rank_ffn_up         != n_rank_ffn_up)
        || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
        || (lora.hparams.n_rank_norm           != n_rank_norm)
        || (lora.hparams.n_rank_output         != n_rank_output)
@@ -50,9 +50,9 @@ struct my_llama_layer {
    struct ggml_tensor * ffn_norm;

    // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
+    struct ggml_tensor * ffn_gate; // w1
+    struct ggml_tensor * ffn_down; // w2
+    struct ggml_tensor * ffn_up;   // w3
 };

 struct my_llama_model {
@@ -140,9 +140,9 @@ static void set_param_model(struct my_llama_model * model) {
        ggml_set_param(ctx, layer.wv);
        ggml_set_param(ctx, layer.wo);
        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
+        ggml_set_param(ctx, layer.ffn_gate);
+        ggml_set_param(ctx, layer.ffn_down);
+        ggml_set_param(ctx, layer.ffn_up);
    }
 }

@@ -198,9 +198,9 @@ static void init_model(struct my_llama_model * model) {

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
+        layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
+        layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);
+        layer.ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);

        ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));

@@ -211,9 +211,9 @@ static void init_model(struct my_llama_model * model) {

        ggml_set_name(layer.ffn_norm,       tni(LLM_TENSOR_FFN_NORM, i));

-        ggml_set_name(layer.w1,             tni(LLM_TENSOR_FFN_GATE, i));
-        ggml_set_name(layer.w2,             tni(LLM_TENSOR_FFN_DOWN, i));
-        ggml_set_name(layer.w3,             tni(LLM_TENSOR_FFN_UP, i));
+        ggml_set_name(layer.ffn_gate,       tni(LLM_TENSOR_FFN_GATE, i));
+        ggml_set_name(layer.ffn_down,       tni(LLM_TENSOR_FFN_DOWN, i));
+        ggml_set_name(layer.ffn_up,         tni(LLM_TENSOR_FFN_UP, i));
    }

    set_param_model(model);
@@ -244,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,

        randomize_tensor_normal(layer.ffn_norm, rnd);

-        randomize_tensor_normal(layer.w1, rnd);
-        randomize_tensor_normal(layer.w2, rnd);
-        randomize_tensor_normal(layer.w3, rnd);
+        randomize_tensor_normal(layer.ffn_gate, rnd);
+        randomize_tensor_normal(layer.ffn_down, rnd);
+        randomize_tensor_normal(layer.ffn_up,   rnd);
    }

    free_random_normal_distribution(rnd);
@@ -356,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs(
        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, f_norm_rms_eps);                    set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
        struct ggml_tensor * t23 = ggml_repeat       (ctx, layer.ffn_norm, t22);                    set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                               set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, layer.w3, t24);                          set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, layer.w1, t24);                          set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
+        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, layer.ffn_up, t24);                      set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
+        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, layer.ffn_gate, t24);                    set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                    set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                               set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, layer.w2, t28);                          set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
+        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, layer.ffn_down, t28);                    set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                               set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
        cur = t30;
        checkpoints.push_back(cur);
@@ -521,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
        copy_tensor_by_name(layer.wv,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
        copy_tensor_by_name(layer.wo,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
        copy_tensor_by_name(layer.ffn_norm,       f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
-        copy_tensor_by_name(layer.w1,             f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
-        copy_tensor_by_name(layer.w2,             f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
-        copy_tensor_by_name(layer.w3,             f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
+        copy_tensor_by_name(layer.ffn_gate,       f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
+        copy_tensor_by_name(layer.ffn_down,       f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
+        copy_tensor_by_name(layer.ffn_up,         f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
    }
 }

@@ -664,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
        gguf_add_tensor(fctx, layer.wv);
        gguf_add_tensor(fctx, layer.wo);
        gguf_add_tensor(fctx, layer.ffn_norm);
-        gguf_add_tensor(fctx, layer.w1);
-        gguf_add_tensor(fctx, layer.w2);
-        gguf_add_tensor(fctx, layer.w3);
+        gguf_add_tensor(fctx, layer.ffn_gate);
+        gguf_add_tensor(fctx, layer.ffn_down);
+        gguf_add_tensor(fctx, layer.ffn_up);
    }
 }

@@ -915,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
        nx += ggml_nelements(layer.wv);
        nx += ggml_nelements(layer.wo);
        nx += ggml_nelements(layer.ffn_norm);
-        nx += ggml_nelements(layer.w1);
-        nx += ggml_nelements(layer.w2);
-        nx += ggml_nelements(layer.w3);
+        nx += ggml_nelements(layer.ffn_gate);
+        nx += ggml_nelements(layer.ffn_down);
+        nx += ggml_nelements(layer.ffn_up);
    }
    return nx;
 }
@@ -40,6 +40,7 @@ class Keys:
        TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
        EXPERT_COUNT          = "{arch}.expert_count"
        EXPERT_USED_COUNT     = "{arch}.expert_used_count"
+        POOLING_LAYER         = "{arch}.pooling_layer"

    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
@@ -360,6 +360,9 @@ class GGUFWriter:
    def add_causal_attention(self, value: bool) -> None:
        self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)

+    def add_pooling_layer(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
+
    def add_rope_dimension_count(self, count: int) -> None:
        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

@@ -254,6 +254,7 @@ enum llm_kv {
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
+    LLM_KV_POOLING_LAYER,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -311,6 +312,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_TENSOR_DATA_LAYOUT,            "%s.tensor_data_layout"    },
    { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"          },
    { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"     },
+    { LLM_KV_POOLING_LAYER,                 "%s.pooling_layer"         },

    { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
@@ -1539,6 +1541,7 @@ struct llama_hparams {
    float f_max_alibi_bias;

    bool causal_attn = true;
+    bool pooling_layer = false;


    bool operator!=(const llama_hparams & other) const {
@@ -1601,6 +1604,7 @@ struct llama_cparams {

    bool mul_mat_q;
    bool offload_kqv;
+    bool do_pooling;

    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data;
@@ -1896,7 +1900,7 @@ struct llama_context {
    struct ggml_tensor * inp_pos;       // I32 [n_batch]
    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
-    struct ggml_tensor * inp_sum;       // F32 [1, n_batch]
+    struct ggml_tensor * inp_sum;       // F32 [n_batch, n_batch]

 #ifdef GGML_USE_MPI
    ggml_mpi_context * ctx_mpi = NULL;
@@ -3053,6 +3057,7 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);

                switch (hparams.n_layer) {
                    case 3:
@@ -3309,7 +3314,12 @@ static void llm_load_vocab(

    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else {
@@ -4379,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

        model.hparams.vocab_only = params.vocab_only;

-        llm_load_arch   (ml, model);
-        llm_load_hparams(ml, model);
-        llm_load_vocab  (ml, model);
+        try {
+            llm_load_arch(ml, model);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+        }
+        try {
+            llm_load_hparams(ml, model);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+        }
+        try {
+            llm_load_vocab(ml, model);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+        }

        llm_load_print_meta(ml, model);

@@ -4859,7 +4881,7 @@ struct llm_build_context {
    const int32_t n_orig_ctx;

    const bool do_rope_shift;
-    const bool causal_attn;
+    const bool do_pooling;

    const llm_build_cb & cb;

@@ -4903,7 +4925,7 @@ struct llm_build_context {
        kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
        n_orig_ctx       (cparams.n_yarn_orig_ctx),
        do_rope_shift    (worst_case || kv_self.has_shift),
-        causal_attn      (hparams.causal_attn),
+        do_pooling       (hparams.pooling_layer && cparams.do_pooling),
        cb               (cb),
        buf_compute_meta (lctx.buf_compute_meta) {
            // all initializations should be done in init()
@@ -5752,17 +5774,18 @@ struct llm_build_context {

        const int64_t n_embd_head = hparams.n_embd_head_v;
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;

        // get input vectors with right size
+        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
+        struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
        // token types are hardcoded to zero ("Sentence A")
        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
        inpL = ggml_add(ctx0, inpL, type_row0);
@@ -5832,9 +5855,11 @@ struct llm_build_context {
        // final output
        cur = inpL;

-        // pooling
-        cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
-        cb(cur, "result_embed", -1);
+        // pooling layer
+        if (do_pooling) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+        }
+        cb(cur, "result_embd", -1);

        ggml_build_forward_expand(gf, cur);

@@ -7367,7 +7392,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

                for (int i = 0; i < n_kv; ++i) {
                    float f;
-                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
+                        (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
                        f = -INFINITY;
                    } else {
                        f = 0;
@@ -7378,7 +7404,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }

-
    {
        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
        float * data = (float *) lctx.inp_sum->data;
@@ -7399,6 +7424,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
            data[i] = lctx.kv_self.cells[i].delta;
        }
    }
+
+    if (hparams.pooling_layer && cparams.do_pooling) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+        float * data = (float *) lctx.inp_sum->data;
+
+        memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            data[seq_id*n_tokens + i] = 1.0f;
+        }
+    }
 }

 // decode a batch of tokens by evaluating the transformer
@@ -7510,7 +7549,7 @@ static int llama_decode_internal(
            embeddings = gf->nodes[gf->n_nodes - 3];
            GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
        }
-    } else if (strcmp(res->name, "result_embed") == 0) {
+    } else if (strcmp(res->name, "result_embd") == 0) {
        embeddings = res;
        res = nullptr;
    } else {
@@ -7630,11 +7669,12 @@ static int llama_decode_internal(
    if (!lctx.embedding.empty()) {
        auto & embedding_out = lctx.embedding;

-        const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
+        const int64_t embd_pos  = res ? n_embd * (n_tokens-1) : 0;
+        const int64_t embd_size = res ? n_embd : n_embd * n_tokens;

-        embedding_out.resize(n_embd);
+        embedding_out.resize(embd_size);
        ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
-        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
+        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
        ggml_backend_synchronize(embeddings_backend);
    }

@@ -7711,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            return vocab.token_to_id.at(buf);
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
@@ -7759,7 +7805,7 @@ struct llm_bigram_spm {
 };

 struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // split string into utf8 chars
@@ -7834,6 +7880,7 @@ private:

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
+            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int)symbol.n; ++j) {
                llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
                output.push_back(token_id);
@@ -8396,17 +8443,18 @@ struct fragment_buffer_variant {
        token(_token),
        raw_text(_dummy),
        offset(0),
-        length(0){}
+        length(0) {}
+
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        token((llama_vocab::id)-1),
+        token((llama_vocab::id) - 1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length){
-            GGML_ASSERT( _offset >= 0 );
-            GGML_ASSERT( _length >= 1 );
-            GGML_ASSERT( offset + length <= raw_text.length() );
+            GGML_ASSERT(_offset >= 0);
+            GGML_ASSERT(_length >= 1);
+            GGML_ASSERT(offset + length <= raw_text.length());
        }

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
@@ -8530,14 +8578,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
    }

    std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

-    if (special) tokenizer_st_partition( vocab, fragment_buffer );
+    if (special) tokenizer_st_partition(vocab, fragment_buffer);

    switch (vocab.type) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
-                for (const auto & fragment: fragment_buffer) {
+                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        // without adding this leading whitespace, we do not get the same results as the original tokenizer

@@ -8565,7 +8613,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                for (const auto & fragment: fragment_buffer) {
+                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

@@ -8581,7 +8629,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
            } break;
        case LLAMA_VOCAB_TYPE_WPM:
            {
-                for (const auto & fragment: fragment_buffer) {
+                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

@@ -10444,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+        // do not quantize positional embeddings and token types (BERT)
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        enum ggml_type new_type;
        void * new_data;
@@ -10946,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
        /*.logits_all                  =*/ false,
        /*.embedding                   =*/ false,
        /*.offload_kqv                 =*/ true,
+        /*.do_pooling                  =*/ true,
    };

    return result;
@@ -11101,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
    cparams.yarn_beta_slow   = params.yarn_beta_slow;
    cparams.mul_mat_q        = params.mul_mat_q;
    cparams.offload_kqv      = params.offload_kqv;
+    cparams.do_pooling       = params.do_pooling;

    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
@@ -11248,7 +11302,7 @@ struct llama_context * llama_new_context_with_model(
        // resized during inference, reserve maximum
        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

-        if (params.embedding){
+        if (params.embedding) {
            ctx->embedding.resize(hparams.n_embd);
        }

@@ -11266,7 +11320,7 @@ struct llama_context * llama_new_context_with_model(
            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-            ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
+            ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);

            ggml_set_name(ctx->inp_tokens,  "inp_tokens");
            ggml_set_name(ctx->inp_embd,    "inp_embd");
@@ -12124,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }

+float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+}
+
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].text.c_str();
 }
@@ -236,6 +236,7 @@ extern "C" {
        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embedding;   // embedding mode only
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
    };

    // model quantization parameters
@@ -628,6 +629,10 @@ extern "C" {
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

+    // Get the embeddings for the ith sequence
+    // llama_get_embeddings(ctx) + i*n_embd
+    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+
    //
    // Vocab
    //
@@ -4,13 +4,13 @@
 #include "console.h"

 #include <cassert>
+#include <codecvt>
 #include <cstdio>
 #include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
 #include <locale>
+#include <string>
+#include <thread>
+#include <vector>

 int main(int argc, char **argv) {
    if (argc < 2) {
@@ -74,45 +74,46 @@ int main(int argc, char **argv) {
            }
        }
        catch (const std::invalid_argument &) {
-            fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
+            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
        }
    }

-    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
-        // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
-        if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
-            std::string str = " " + codepoint_to_utf8(cp);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_bpe(ctx, tokens);
-            if (str != check) {
-                fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                return 3;
-            }
-        }
-    }
-    // Restrict to assigned unicode planes
-    // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
-        }
-    }
-    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx]() {
+                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                    if (!( // NOLINT
+                                (cp < 0x03       || cp >  0x05)   && cp != 0x0b && cp != 0x11 &&
+                                (cp < 0x13       || cp >  0x17)   && cp != 0x19 &&
+                                (cp < 0x1c       || cp >  0x1e)   &&
+                                (cp < 0xd800     || cp >  0xdfff) &&
+                                (cp < 0x00040000 || cp >= 0x000e0000)
+                        )) {
+                        continue;
+                    }
+
+                    std::string str = codepoint_to_utf8(cp);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+                    std::string check = llama_detokenize_bpe(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        std::exit(3);
+                    }
+                }
+            });
+        }
+
+        for (auto & t : threads) {
+            t.join();
        }
    }
+
    llama_free_model(model);
    llama_free(ctx);

@@ -4,13 +4,13 @@
 #include "console.h"

 #include <cassert>
+#include <codecvt>
 #include <cstdio>
 #include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
 #include <locale>
+#include <string>
+#include <thread>
+#include <vector>

 int main(int argc, char **argv) {
    if (argc < 2) {
@@ -72,26 +72,33 @@ int main(int argc, char **argv) {
        }
    }

-    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
-        if (cp < 0xd800 || cp > 0xdfff) {
-            std::string str = codepoint_to_utf8(cp);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_spm(ctx, tokens);
-            if (cp != 9601 && str != check) {
-                fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                return 3;
-            }
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx]() {
+                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                    if (cp >= 0xd800 && cp <= 0xdfff) {
+                        continue;
+                    }
+
+                    std::string str = codepoint_to_utf8(cp);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+                    std::string check = llama_detokenize_spm(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        std::exit(3);
+                    }
+                }
+            });
        }
-    }
-    for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_spm(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+
+        for (auto & t : threads) {
+            t.join();
        }
    }

@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
        offset += 1;
        return result;
    }
-    else if (!(utf8[offset + 0] & 0x40)) {
+    if (!(utf8[offset + 0] & 0x40)) {
        throw std::invalid_argument("invalid character");
    }
-    else if (!(utf8[offset + 0] & 0x20)) {
-        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x20)) {
+        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+        }
        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
        offset += 2;
        return result;
    }
-    else if (!(utf8[offset + 0] & 0x10)) {
-        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x10)) {
+        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+        }
        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
        offset += 3;
        return result;
    }
-    else if (!(utf8[offset + 0] & 0x08)) {
-        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x08)) {
+        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+        }
        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
        offset += 4;
        return result;
@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
        offset += 1;
        return result;
    }
-    else {
-        if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
-            throw std::invalid_argument("invalid character");
-        auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-        offset += 2;
-        return result;
+
+    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+        throw std::invalid_argument("invalid character");
    }
-    throw std::invalid_argument("invalid string");
+
+    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+    offset += 2;
+    return result;
 }

 static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
    std::vector<uint32_t> result;
    size_t offset = 0;
-    while (offset < utf16.size())
+    while (offset < utf16.size()) {
        result.push_back(codepoint_from_utf16(utf16, offset));
+    }
    return result;
 }

@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
 static std::unordered_map<uint32_t, int> codepoint_type_map() {
    std::unordered_map<uint32_t, int> codepoint_types;
    for (auto p : digit_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+        for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
+        }
    }
-    for(auto p : letter_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : letter_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_LETTER;
+        }
    }
-    for(auto p : whitespace_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : whitespace_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
+        }
    }
-    for(auto p : accent_mark_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : accent_mark_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
+        }
    }
-    for(auto p : punctuation_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : punctuation_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
+        }
    }
-    for (auto p : symbol_ranges) {
-        for (auto i = p.first; i <= p.second; ++i)
+    for  (auto p : symbol_ranges) {
+        for (auto i = p.first; i <= p.second; ++i) {
            codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
+        }
    }
-    for(auto p : control_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : control_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
+        }
    }
    return codepoint_types;
 }

 static int codepoint_type(uint32_t cp) {
    static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
-    return codepoint_types[cp];
+    return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
 }

 static int codepoint_type(const std::string & utf8) {
-    if (utf8.length() == 0)
+    if (utf8.length() == 0) {
        return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
    size_t offset = 0;
    return codepoint_type(codepoint_from_utf8(utf8, offset));
 }
Author	SHA1	Message	Date
Aarni Koskela	c4e6dd59e4	llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478 ) * common : don't crash if newline token is not found * common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs	2024-02-13 18:18:16 +02:00
Aarni Koskela	037259be68	llama : make load error reporting more granular (#5477 ) Makes it easier to pinpoint where e.g. `unordered_map::at: key not found` comes from.	2024-02-13 15:24:50 +02:00
Daniel Bevenius	263978904c	finetune : rename feed-forward tensors (w1/w2/w3) (#4839 ) * finetune: rename feed-forward tensors (w1/w2/w3) This commit renames the feed-forward tensors w1, w2 and w3 to ffn_gate, ffn_down and ffn_up respectively. The motivation for this change is to make it easier to understand the purpose of the tensors. This also seems to be inline with the names used in the llama_layer struct in llama.cpp. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com> * train-text-from-scratch: rename ff tensors This commit renames the feed-forward tensors w1, w2 and w3 to ffn_gate, ffn_down and ffn_up respectively. The motivation for this change is to make it easier to understand the purpose of the tensors. This also seems to be inline with the names used in the llama_layer struct in llama.cpp Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com> --------- Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-02-13 15:15:42 +02:00
Georgi Gerganov	cf45252a7c	tests : multi-thread the tokenizer tests (#5474 ) * tests : multi-thread the tokenizer tests ggml-ci * unicode : fix data race for unidentified codepoints ggml-ci * unicode : minor style fixes ggml-ci	2024-02-13 15:14:22 +02:00
Douglas Hanley	03bf161eb6	llama : support batched embeddings (#5466 ) * batched embedding: pool outputs by sequence id. updated embedding example * bring back non-causal attention * embd : minor improvements * llama : minor --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-02-13 14:06:58 +02:00
Johannes Gäßler	ad014bba97	make: add error message for bad CUDA version (#5444 ) * make: add error message for bad CUDA version * Update Makefile Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> --------- Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>	2024-02-13 12:38:37 +01:00
Georgi Gerganov	49cc1f7d67	bert : add tests + fix quantization (#5475 ) * llama : do not quantize pos embd and token type tensors * ci : add BERT tests ggml-ci * ci : do not do BERT tests on low-perf nodes ggml-ci	2024-02-13 13:01:29 +02:00