Merge branch 'master' into speculative-tree

ggml-ci
2026-06-14 17:56:43 +02:00 · 2023-10-18 10:50:58 +03:00 · 2023-10-17 19:31:40 +03:00 · 2023-10-17 17:24:11 +03:00 · 2023-10-17 17:04:31 +03:00 · 2023-10-17 11:40:26 +03:00
48 changed files with 4824 additions and 933 deletions
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search  \
+	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search  \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
@@ -608,6 +608,13 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
+
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -10,9 +10,13 @@
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics
-
- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
 - ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
+- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+  **Devs should become familiar with the new API**
+- Local Falcon 180B inference on Mac Studio
+
+  https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e

 ----

@@ -962,6 +966,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /

 - [main](./examples/main/README.md)
 - [server](./examples/server/README.md)
+- [embd-input](./examples/embd-input/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
@@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    std::string arg;
    gpt_params default_params;
    const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sparams;
+    llama_sampling_params & sparams = params.sampling_params;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -241,26 +241,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            sparams.penalty_last_n = std::stoi(argv[i]);
-            sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
+            sparams.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.penalty_repeat = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.penalty_freq = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.penalty_present = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
@@ -573,7 +572,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            sparams.grammar = argv[i];
+            params.grammar = argv[i];
        } else if (arg == "--grammar-file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -588,7 +587,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
-                std::back_inserter(sparams.grammar)
+                std::back_inserter(params.grammar)
            );
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
@@ -641,7 +640,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sparams;
+    const llama_sampling_params & sparams = params.sampling_params;

    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
@@ -679,10 +678,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
    printf("  --mirostat N          use Mirostat sampling.\n");
    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@@ -879,7 +878,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    }

    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
    }

    {
@@ -1124,28 +1123,28 @@ std::string get_sortable_timestamp() {

 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sparams;
+    const llama_sampling_params & sparams = params.sampling_params;

    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
+    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");

 #ifdef NDEBUG
    fprintf(stream, "debug: false\n");
@@ -1179,8 +1178,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
+    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
@@ -1239,14 +1238,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
    dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
    dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);

    fprintf(stream, "reverse_prompt:\n");
    for (std::string ap : params.antiprompt) {
@@ -56,7 +56,7 @@ struct gpt_params {
    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor

    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct llama_sampling_params sampling_params;

    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft       = "";                              // draft model for speculative decoding
@@ -66,6 +66,7 @@ struct gpt_params {
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
    std::string input_prefix      = "";  // string to prefix user inputs with
    std::string input_suffix      = "";  // string to suffix user inputs with
+    std::string grammar           = "";  // optional BNF-like grammar to constrain sampling
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files

@@ -399,7 +399,7 @@ namespace grammar_parser {
    void print_grammar(FILE * file, const parse_state & state) {
        try {
            std::map<uint32_t, std::string> symbol_id_names;
-            for (const auto & kv : state.symbol_ids) {
+            for (const auto & kv : state.symbol_ids) { // bind by reference: avoids copying each (name, id) entry
                symbol_id_names[kv.second] = kv.first;
            }
            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
@@ -1,9 +1,9 @@
 #include "sampling.h"

-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
+struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
    struct llama_sampling_context * result = new llama_sampling_context();

-    result->params  = params;
+    result->params = params.sampling_params;
    result->grammar = nullptr;

    // if there is a grammar, parse it
@@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
    }

-    result->prev.resize(params.n_prev);
+    result->prev.resize(params.n_ctx);

    return result;
 }
@@ -66,56 +66,25 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds
    dst->prev = src->prev;
 }

-llama_token llama_sampling_last(llama_sampling_context * ctx) {
-    return ctx->prev.back();
-}
-
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
-    const int size = ctx_sampling->prev.size();
-
-    n = std::min(n, size);
-
-    std::string result;
-
-    for (int i = size - n; i < size; i++) {
-        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
-    }
-
-    return result;
-}
-
-std::string llama_sampling_print(const llama_sampling_params & params) {
-    char result[1024];
-
-    snprintf(result, sizeof(result),
-            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
-
-    return std::string(result);
-}
-
 llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
                  const int idx) {
-    const llama_sampling_params & params = ctx_sampling->params;
-
+    const int n_ctx   = llama_n_ctx(ctx_main);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

+    const llama_sampling_params & params = ctx_sampling->params;
+
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
+    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float   repeat_penalty  = params.repeat_penalty;
+    const float   alpha_presence  = params.presence_penalty;
+    const float   alpha_frequency = params.frequency_penalty;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
@@ -128,7 +97,7 @@ llama_token llama_sampling_sample(

    float * logits = llama_get_logits_ith(ctx_main, idx);

-    // apply params.logit_bias map
+    // Apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }
@@ -148,10 +117,14 @@ llama_token llama_sampling_sample(
    // apply penalties
    if (!prev.empty()) {
        const float nl_logit = logits[llama_token_nl(ctx_main)];
+        const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);

-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                prev.data() + prev.size() - penalty_last_n,
-                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
+        llama_sample_repetition_penalty(ctx_main, &cur_p,
+                prev.data() + prev.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
+                prev.data() + prev.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
@@ -168,7 +141,7 @@ llama_token llama_sampling_sample(
    }

    if (temp <= 0) {
-        // greedy sampling
+        // Greedy sampling
        id = llama_sample_token_greedy(ctx_main, &cur_p);
    } else {
        if (mirostat == 1) {
@@ -179,9 +152,8 @@ llama_token llama_sampling_sample(
            llama_sample_temp(ctx_main, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
-            // temperature sampling
+            // Temperature sampling
            size_t min_keep = std::max(1, params.n_probs);
-
            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
@@ -211,12 +183,11 @@ llama_token llama_sampling_sample(
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar) {
+        llama_token id) {
    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
    ctx_sampling->prev.push_back(id);

-    if (ctx_sampling->grammar != NULL && apply_grammar) {
+    if (ctx_sampling->grammar != NULL) {
        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
    }
 }
@@ -10,30 +10,30 @@

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
+    float   repeat_penalty    = 1.10f; // 1.0 = disabled
+    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.00f; // 0.0 = disabled
+    float   presence_penalty  = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate
+
    bool    penalize_nl       = true;  // consider newlines as a repeatable token

-    std::string grammar;  // optional BNF-like grammar to constrain sampling
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.

    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale     = 1.f; // how strong is guidance
+    std::string cfg_negative_prompt;   // string to help guidance
+    float       cfg_scale     = 1.f;   // How strong is guidance

    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
 } llama_sampling_params;

 // general sampler context
@@ -58,7 +58,7 @@ struct llama_sampling_context {
 #include "common.h"

 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);

 void llama_sampling_free(struct llama_sampling_context * ctx);

@@ -70,15 +70,6 @@ void llama_sampling_reset(llama_sampling_context * ctx);
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

-// Get the last sampled token
-llama_token llama_sampling_last(llama_sampling_context * ctx);
-
-// Get a string representation of the last sampled tokens
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
-
-// Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
-
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
@@ -105,5 +96,4 @@ llama_token llama_sampling_sample(
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar);
+        llama_token id);
@@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc

        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
+        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; // impr_plot is derived from both losses, so guard both
        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
            *sched, opt->loss_after);
@@ -76,7 +76,6 @@ def parse_args() -> argparse.Namespace:
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
-    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
    return parser.parse_args()

 args = parse_args()
@@ -87,11 +86,6 @@ if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

-endianess = gguf.GGUFEndian.LITTLE
-if args.bigendian:
-    endianess = gguf.GGUFEndian.BIG
-endianess_str = "Big Endian" if args.bigendian else "Little Endian"
-print(f"gguf: Conversion Endianess {endianess}")
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@@ -119,7 +113,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
 num_parts = count_model_parts(dir_model)
 print(f"num_parts:{num_parts}\n")
 ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

@@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

-if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
+if hparams["architectures"][0] != "FalconForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)
@@ -97,17 +97,7 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

-block_count = hparams.get("num_hidden_layers")
-if block_count is None:
-    block_count = hparams["n_layer"]  # old name
-
-n_head = hparams.get("num_attention_heads")
-if n_head is None:
-    n_head = hparams["n_head"]  # old name
-
-n_head_kv = hparams.get("num_kv_heads")
-if n_head_kv is None:
-    n_head_kv = hparams.get("n_head_kv", 1)  # old name
+block_count = hparams["num_hidden_layers"]

 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
@@ -115,8 +105,11 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(n_head)
-gguf_writer.add_head_count_kv(n_head_kv)
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+if "num_kv_heads" in hparams:
+    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)

@@ -159,6 +152,10 @@ special_vocab.add_to_gguf(gguf_writer)

 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

+# params for qkv transform
+n_head    = hparams["num_attention_heads"]
+n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
+
 head_dim = hparams["hidden_size"] // n_head

 # tensor info
@@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -875,10 +875,10 @@ class OutputFile:
        self.gguf.close()

    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
        check_vocab_size(params, vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out)

        # meta data
        of.add_meta_arch(params)
@@ -903,10 +903,10 @@ class OutputFile:
        return dt.quantize(arr)

    @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
        check_vocab_size(params, vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out)

        # meta data
        of.add_meta_arch(params)
@@ -1123,9 +1123,8 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
-    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
-
    args = parser.parse_args(args_in)
+
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
@@ -1139,9 +1138,6 @@ def main(args_in: list[str] | None = None) -> None:
    if args.dump:
        do_dump_model(model_plus)
        return
-    endianess = gguf.GGUFEndian.LITTLE
-    if args.bigendian:
-        endianess = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1189,7 +1185,7 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
    print(f"Wrote {outfile}")


@@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
-    add_subdirectory(baby-llama)
-    add_subdirectory(batched)
-    add_subdirectory(batched-bench)
-    add_subdirectory(beam-search)
-    add_subdirectory(benchmark)
-    add_subdirectory(convert-llama2c-to-ggml)
-    add_subdirectory(embedding)
-    add_subdirectory(finetune)
-    add_subdirectory(infill)
-    add_subdirectory(llama-bench)
-    add_subdirectory(llava)
    add_subdirectory(main)
-    add_subdirectory(parallel)
-    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
+    add_subdirectory(perplexity)
+    add_subdirectory(embedding)
    add_subdirectory(save-load-state)
-    add_subdirectory(simple)
-    add_subdirectory(speculative)
+    add_subdirectory(benchmark)
+    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(finetune)
+    add_subdirectory(convert-llama2c-to-ggml)
+    add_subdirectory(simple)
+    add_subdirectory(batched)
+    add_subdirectory(batched-bench)
+    add_subdirectory(speculative)
+    add_subdirectory(parallel)
+    add_subdirectory(embd-input)
+    add_subdirectory(llava)
+    add_subdirectory(llama-bench)
+    add_subdirectory(beam-search)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
@@ -11,16 +11,12 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
        return 1 ;
    }

-    // number of parallel batches
    int n_parallel = 1;

-    // total length of the sequences including the prompt
-    int n_len = 32;
-
    if (argc >= 2) {
        params.model = argv[1];
    }
@@ -33,14 +29,13 @@ int main(int argc, char ** argv) {
        n_parallel = std::atoi(argv[3]);
    }

-    if (argc >= 5) {
-        n_len = std::atoi(argv[4]);
-    }
-
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

+    // total length of the sequences including the prompt
+    const int n_len = 32;
+
    // init LLM

    llama_backend_init(params.numa);
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
    if (file.size < 4) {
        return false;
    }
-    std::string magic = file.read_string(4);
+    uint32_t magic = file.read_u32();
    return magic == GGUF_MAGIC;
 }

@@ -0,0 +1,4 @@
+PandaGPT
+MiniGPT-4
+*.pth
+
@@ -0,0 +1,17 @@
+# Library target built from embd-input-lib.cpp.
+# No STATIC/SHARED keyword is passed to add_library(), so the library type
+# follows the BUILD_SHARED_LIBS setting of the enclosing build.
+set(TARGET embdinput)
+add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# Only depend on BUILD_INFO when the parent project defines that target.
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+# Small executable that exercises the embdinput library.
+set(TARGET embd-input-test)
+add_executable(${TARGET} embd-input-test.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# Only depend on BUILD_INFO when the parent project defines that target.
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
@@ -0,0 +1,63 @@
+### Examples for input embedding directly
+
+## Requirement
+Build `libembdinput.so` by running the following command in the main directory (../../):
+```
+make
+```
+
+## [LLaVA](https://github.com/haotian-liu/LLaVA/) example  (llava.py)
+
+1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
+2. Convert it to ggml format.
+3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
+
+```
+import torch
+
+bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
+pth_path = "./examples/embd-input/llava_projection.pth"
+
+dic = torch.load(bin_path)
+used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
+torch.save({k: dic[k] for k in used_key}, pth_path)
+```
+4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
+
+
+## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
+
+1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
+The `adapter_config.json` is
+```
+{
+  "peft_type": "LORA",
+  "fan_in_fan_out": false,
+  "bias": null,
+  "modules_to_save": null,
+  "r": 32,
+  "lora_alpha": 32,
+  "lora_dropout": 0.1,
+  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
+}
+```
+2. Prepare the `vicuna` v0 model.
+3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
+4. Clone the PandaGPT source.
+```
+git clone https://github.com/yxuansu/PandaGPT
+```
+5. Install the requirements of PandaGPT.
+6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
+
+## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
+
+1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
+2. Clone the MiniGPT-4 source.
+```
+git clone https://github.com/Vision-CAIR/MiniGPT-4/
+```
+3. Install the requirements of MiniGPT-4.
+4. Prepare the `vicuna` v0 model.
+5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
@@ -0,0 +1,221 @@
+#include "build-info.h"
+#include "common.h"
+#include "embd-input.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static llama_context ** g_ctx;
+
+extern "C" {
+
+// Creates a MyModel handle from command-line style arguments.
+//
+// argc/argv are parsed with gpt_params_parse(); the model and context are then
+// created with llama_init_from_gpt_params().
+//
+// Returns a heap-allocated handle that must be released with free_mymodel(),
+// or nullptr when argument parsing or model loading fails.
+struct MyModel* create_mymodel(int argc, char ** argv) {
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        return nullptr;
+    }
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = uint32_t(time(NULL));
+    }
+    // the seed is unsigned, so print it with %u (with %d large seeds show up negative)
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    llama_backend_init(params.numa);
+
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+
+    // load the model and apply lora adapter, if any
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return nullptr;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+    }
+    struct MyModel * ret = new MyModel();
+    ret->ctx = ctx;
+    ret->params = params;
+    ret->n_past = 0;
+
+    // Point the global at the handle's own storage. The previous code stored the
+    // address of the local `ctx`, which dangles as soon as this function returns.
+    g_ctx = &ret->ctx;
+
+    // printf("ctx: %d\n", ret->ctx);
+    return ret;
+}
+
+// Prints timing statistics, frees the llama context and destroys the handle.
+// A null handle is ignored, so callers may pass the result of a failed
+// create_mymodel() without crashing.
+// NOTE(review): the llama_model loaded in create_mymodel() is not stored in
+// MyModel and is not released here — confirm whether that is intentional.
+void free_mymodel(struct MyModel * mymodel) {
+    if (mymodel == nullptr) {
+        return;
+    }
+    llama_context * ctx = mymodel->ctx;
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    delete mymodel;
+}
+
+
+// Evaluates N embedding vectors, read from `input`, at the model's current position.
+//
+// model : pointer to a MyModel (passed as void* for the C interface)
+// input : float buffer; vector i is read starting at input + i*n_embd
+// N     : number of embedding vectors
+//
+// Returns false if llama_decode() reports an error; on success n_past is
+// advanced by N.
+bool eval_float(void * model, float * input, int N){
+    MyModel * mymodel = (MyModel*)model;
+    llama_context * ctx = mymodel->ctx;
+    // (the former local copy of mymodel->params was never read and has been dropped)
+    const int n_emb = llama_n_embd(llama_get_model(ctx));
+    int n_past = mymodel->n_past;
+    const int n_batch = N; // params.n_batch;
+
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = N - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        n_past += n_eval;
+    }
+    mymodel->n_past = n_past;
+    return true;
+}
+
+// Evaluates `tokens` in chunks of at most params.n_batch tokens.
+//
+// model  : pointer to a MyModel (passed as void* for the C interface)
+// tokens : token ids to evaluate, in order
+//
+// Returns false if llama_decode() reports an error; on success n_past is
+// advanced by tokens.size().
+bool eval_tokens(void * model, std::vector<llama_token> tokens) {
+    MyModel * mymodel = (MyModel* )model;
+    llama_context * ctx = mymodel->ctx;
+    const int n_tokens = (int) tokens.size();
+    // Read the batch size directly instead of copying the whole gpt_params.
+    // A non-positive batch size would never advance the loop below, so fall
+    // back to evaluating everything in a single batch.
+    const int n_batch = mymodel->params.n_batch > 0 ? (int) mymodel->params.n_batch : n_tokens;
+    int n_past = mymodel->n_past;
+    for (int i = 0; i < n_tokens; i += n_batch) {
+        int n_eval = n_tokens - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        n_past += n_eval;
+    }
+    mymodel->n_past = n_past;
+    return true;
+}
+
+// Evaluates a single token id by forwarding a one-element vector to eval_tokens().
+bool eval_id(struct MyModel* mymodel, int id) {
+    const std::vector<llama_token> single_token{ static_cast<llama_token>(id) };
+    return eval_tokens(mymodel, single_token);
+}
+
+// Tokenizes `str` (third llama_tokenize() argument set to true) and evaluates
+// the resulting tokens.
+//
+// Returns the result of eval_tokens(), so decode failures are reported to the
+// caller instead of being masked by an unconditional `true`. A null `str`
+// yields false.
+bool eval_string(struct MyModel * mymodel,const char* str){
+    if (str == nullptr) {
+        return false;
+    }
+    llama_context * ctx = mymodel->ctx;
+    std::string text = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, text, true);
+    return eval_tokens(mymodel, embd_inp);
+}
+
+// Samples the next token id from the logits currently held by the context.
+//
+// Logit biases from the sampling parameters are added in place to the
+// context's logits. The sampler is then chosen as follows: greedy when
+// temp <= 0, mirostat v1/v2 when mirostat is 1/2, otherwise
+// top-k -> tail-free -> typical -> top-p -> temperature.
+// Repetition/frequency/presence penalties are not applied (see TODO below).
+llama_token sampling_id(struct MyModel* mymodel) {
+    llama_context* ctx = mymodel->ctx;
+    // Bind by reference: copying the whole gpt_params for every sampled token is wasted work.
+    const llama_sampling_params & sparams = mymodel->params.sampling_params;
+    // int n_ctx = llama_n_ctx(ctx);
+
+    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    // out of user input, sample next token
+    const float   temp            = sparams.temp;
+    const int32_t top_k           = sparams.top_k <= 0 ? n_vocab : sparams.top_k;
+    const float   top_p           = sparams.top_p;
+    const float   tfs_z           = sparams.tfs_z;
+    const float   typical_p       = sparams.typical_p;
+    // const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    // const float   repeat_penalty  = params.repeat_penalty;
+    // const float   alpha_presence  = params.presence_penalty;
+    // const float   alpha_frequency = params.frequency_penalty;
+    const int     mirostat        = sparams.mirostat;
+    const float   mirostat_tau    = sparams.mirostat_tau;
+    const float   mirostat_eta    = sparams.mirostat_eta;
+    // const bool    penalize_nl     = params.penalize_nl;
+
+    llama_token id = 0;
+    {
+        auto logits  = llama_get_logits(ctx);
+
+        // Apply params.logit_bias map
+        for (const auto & bias : sparams.logit_bias) {
+            logits[bias.first] += bias.second;
+        }
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        // TODO: Apply penalties
+        // float nl_logit = logits[llama_token_nl(ctx)];
+        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+        // llama_sample_repetition_penalty(ctx, &candidates_p,
+        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //      last_n_repeat, repeat_penalty);
+        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        // last_n_repeat, alpha_frequency, alpha_presence);
+        // if (!penalize_nl) {
+        //     logits[llama_token_nl(ctx)] = nl_logit;
+        // }
+
+        if (temp <= 0) {
+            // Greedy sampling
+            id = llama_sample_token_greedy(ctx, &candidates_p);
+        } else {
+            if (mirostat == 1) {
+                // NOTE(review): function-local static, so this state is shared by every
+                // MyModel and initialized only from the first call's tau.
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                const int mirostat_m = 100;
+                llama_sample_temp(ctx, &candidates_p, temp);
+                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+            } else if (mirostat == 2) {
+                // NOTE(review): same sharing caveat as the mirostat v1 branch above.
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                llama_sample_temp(ctx, &candidates_p, temp);
+                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+            } else {
+                // Temperature sampling
+                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+                llama_sample_temp(ctx, &candidates_p, temp);
+                id = llama_sample_token(ctx, &candidates_p);
+            }
+        }
+    }
+
+    return id;
+}
+
+// Samples one token, feeds it back into the model and returns its text.
+// The end-of-sequence token is reported as the literal "</s>".
+// The returned pointer refers to a function-local static string that is
+// overwritten by the next call.
+const char * sampling(struct MyModel * mymodel) {
+    static std::string piece;
+
+    llama_context * lctx = mymodel->ctx;
+    const int token = sampling_id(mymodel);
+
+    const bool is_eos = (token == llama_token_eos(lctx));
+    if (!is_eos) {
+        piece = llama_token_to_piece(lctx, token);
+    } else {
+        piece = "</s>";
+    }
+
+    // advance the model state with the token that was just sampled
+    eval_id(mymodel, token);
+    return piece.c_str();
+}
+
+}
@@ -0,0 +1,35 @@
+#include "embd-input.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <random>
+#include <vector>
+
+// Smoke test: evaluates a text prompt, N random embedding vectors and the
+// user-supplied prompt, then prints up to max_tgt_len sampled pieces.
+// Returns 1 when the model cannot be created, 0 otherwise.
+int main(int argc, char** argv) {
+
+    auto mymodel = create_mymodel(argc, argv);
+    if (mymodel == nullptr) {
+        // create_mymodel() returns nullptr on bad arguments or a failed model load
+        fprintf(stderr, "%s: failed to create model\n", __func__);
+        return 1;
+    }
+    const int N = 10;
+    const int max_tgt_len = 500;
+    const int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
+
+    // add random float embd to test evaluation
+    // (std::vector owns the buffer, so it is released on every exit path)
+    std::vector<float> data((size_t) N * n_embd);
+    std::default_random_engine e;
+    std::uniform_real_distribution<float>  u(0,1);
+    for (float & value : data) {
+        value = u(e);
+    }
+
+    eval_string(mymodel, "user: what is the color of the flag of UN?");
+    eval_float(mymodel, data.data(), N);
+    eval_string(mymodel, "assistant:");
+    eval_string(mymodel, mymodel->params.prompt.c_str());
+    const char* tmp;
+    for (int i=0; i<max_tgt_len; i++) {
+        tmp = sampling(mymodel);
+        if (strcmp(tmp, "</s>")==0) break;
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+    printf("\n");
+    free_mymodel(mymodel);
+    return 0;
+}
@@ -0,0 +1,27 @@
+// Helper API for feeding token ids, strings and raw embedding vectors into a
+// llama context and sampling from it. The functions are declared with C linkage.
+// NOTE(review): identifiers that start with an underscore followed by an
+// uppercase letter are reserved for the implementation — consider renaming the guard.
+#ifndef _EMBD_INPUT_H_
+#define _EMBD_INPUT_H_ 1
+
+#include "common.h"
+#include "llama.h"
+
+extern "C" {
+
+// Per-model state shared by all functions below.
+typedef struct MyModel {
+    llama_context* ctx;   // context used for decoding and sampling
+    gpt_params params;    // parameters parsed when the model was created
+    int n_past = 0;       // position passed to the next decode call
+} MyModel;
+
+// Parses argc/argv into gpt_params and loads the model; returns nullptr on failure.
+struct MyModel* create_mymodel(int argc, char ** argv);
+
+// Evaluates N embedding vectors read from `input`; `model` points to a MyModel.
+bool eval_float(void* model, float* input, int N);
+// Evaluates `tokens` in batches; `model` points to a MyModel.
+// NOTE(review): takes a std::vector by value despite the extern "C" linkage.
+bool eval_tokens(void* model, std::vector<llama_token> tokens);
+// Evaluates a single token id.
+bool eval_id(struct MyModel* mymodel, int id);
+// Tokenizes `str` and evaluates the resulting tokens.
+bool eval_string(struct MyModel* mymodel, const char* str);
+// Samples one token, evaluates it and returns its text ("</s>" for end of sequence).
+const char * sampling(struct MyModel* mymodel);
+// Samples one token id from the current logits without evaluating it.
+llama_token sampling_id(struct MyModel* mymodel);
+// Prints timings, frees the context and deletes the handle.
+void free_mymodel(struct MyModel* mymodel);
+
+}
+
+#endif
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+import ctypes
+from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
+import numpy as np
+import os
+
+libc = cdll.LoadLibrary("./libembdinput.so")
+libc.sampling.restype=c_char_p
+libc.create_mymodel.restype=c_void_p
+libc.eval_string.argtypes=[c_void_p, c_char_p]
+libc.sampling.argtypes=[c_void_p]
+libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
+
+
+class MyModel:
+    def __init__(self, args):
+        argc = len(args)
+        c_str = [c_char_p(i.encode()) for i in args]
+        args_c = (c_char_p * argc)(*c_str)
+        self.model = c_void_p(libc.create_mymodel(argc, args_c))
+        self.max_tgt_len = 512
+        self.print_string_eval = True
+
+    def __del__(self):
+        libc.free_mymodel(self.model)
+
+    def eval_float(self, x):
+        libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
+
+    def eval_string(self, x):
+        libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
+        if self.print_string_eval:
+            print(x)
+
+    def eval_token(self, x):
+        libc.eval_id(self.model, x)
+
+    def sampling(self):
+        s = libc.sampling(self.model)
+        return s
+
+    def stream_generate(self, end="</s>"):
+        ret = b""
+        end = end.encode()
+        for _ in range(self.max_tgt_len):
+            tmp = self.sampling()
+            ret += tmp
+            yield tmp
+            if ret.endswith(end):
+                break
+
+    def generate_with_print(self, end="</s>"):
+        ret = b""
+        for i in self.stream_generate(end=end):
+            ret += i
+            print(i.decode(errors="replace"), end="", flush=True)
+        print("")
+        return ret.decode(errors="replace")
+
+
+    def generate(self, end="</s>"):
+        text = b"".join(self.stream_generate(end=end))
+        return text.decode(errors="replace")
+
+if __name__ == "__main__":
+    model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
+    model.eval_string("""user: what is the color of the flag of UN?""")
+    x = np.random.random((5120,10))# , dtype=np.float32)
+    model.eval_float(x)
+    model.eval_string("""assistant:""")
+    for i in model.generate():
+        print(i.decode(errors="replace"), end="", flush=True)
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+from embd_input import MyModel
+import numpy as np
+from torch import nn
+import torch
+from transformers import CLIPVisionModel,  CLIPImageProcessor
+from PIL import Image
+
+# Model parameters taken from 'liuhaotian/LLaVA-13b-delta-v1-1'.
+# Name of the CLIP checkpoint used as the vision tower.
+vision_tower = "openai/clip-vit-large-patch14"
+# Index into the vision tower's hidden states used as image features.
+select_hidden_state_layer = -2
+# Number of image tokens: (vision_config.image_size // vision_config.patch_size) ** 2
+image_token_len = (224//14)**2
+
+class Llava:
+    def __init__(self, args):
+        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
+        self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
+        self.mm_projector = nn.Linear(1024, 5120)
+        self.model = MyModel(["main", *args])
+
+    def load_projection(self, path):
+        state = torch.load(path)
+        self.mm_projector.load_state_dict({
+            "weight": state["model.mm_projector.weight"],
+            "bias": state["model.mm_projector.bias"]})
+
+    def chat(self, question):
+        self.model.eval_string("user: ")
+        self.model.eval_string(question)
+        self.model.eval_string("\nassistant: ")
+        return self.model.generate_with_print()
+
+    def chat_with_image(self, image, question):
+        with torch.no_grad():
+            embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+            image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
+            select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
+            image_feature = select_hidden_state[:, 1:]
+            embd_image = self.mm_projector(image_feature)
+            embd_image = embd_image.cpu().numpy()[0]
+        self.model.eval_string("user: ")
+        self.model.eval_token(32003-2) # im_start
+        self.model.eval_float(embd_image.T)
+        for i in range(image_token_len-embd_image.shape[0]):
+            self.model.eval_token(32003-3) # im_patch
+        self.model.eval_token(32003-1) # im_end
+        self.model.eval_string(question)
+        self.model.eval_string("\nassistant: ")
+        return self.model.generate_with_print()
+
+
+if __name__=="__main__":
+    # model form liuhaotian/LLaVA-13b-delta-v1-1
+    a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
+    # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
+    # Also here can use pytorch_model-00003-of-00003.bin directly.
+    a.load_projection(os.path.join(
+        os.path.dirname(__file__) ,
+        "llava_projection.pth"))
+    respose = a.chat_with_image(
+        Image.open("./media/llama1-logo.png").convert('RGB'),
+        "what is the text in the picture?")
+    respose
+    a.chat("what is the color of it?")
+
+
+
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+from embd_input import MyModel
+import numpy as np
+from torch import nn
+import torch
+from PIL import Image
+
+minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
+sys.path.insert(0, minigpt4_path)
+from minigpt4.models.blip2 import Blip2Base
+from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
+
+
+class MiniGPT4(Blip2Base):
+    """
+    MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
+    """
+    def __init__(self,
+        args,
+        vit_model="eva_clip_g",
+        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
+        img_size=224,
+        drop_path_rate=0,
+        use_grad_checkpoint=False,
+        vit_precision="fp32",
+        freeze_vit=True,
+        freeze_qformer=True,
+        num_query_token=32,
+        llama_model="",
+        prompt_path="",
+        prompt_template="",
+        max_txt_len=32,
+        end_sym='\n',
+        low_resource=False,  # use 8 bit and put vit in cpu
+        device_8bit=0
+    ):
+        super().__init__()
+        self.img_size = img_size
+        self.low_resource = low_resource
+        self.preprocessor = Blip2ImageEvalProcessor(img_size)
+
+        print('Loading VIT')
+        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
+            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
+        )
+        print('Loading VIT Done')
+        print('Loading Q-Former')
+        self.Qformer, self.query_tokens = self.init_Qformer(
+            num_query_token, self.visual_encoder.num_features
+        )
+        self.Qformer.cls = None
+        self.Qformer.bert.embeddings.word_embeddings = None
+        self.Qformer.bert.embeddings.position_embeddings = None
+        for layer in self.Qformer.bert.encoder.layer:
+            layer.output = None
+            layer.intermediate = None
+        self.load_from_pretrained(url_or_filename=q_former_model)
+        print('Loading Q-Former Done')
+        self.llama_proj = nn.Linear(
+            self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
+        )
+        self.max_txt_len = max_txt_len
+        self.end_sym = end_sym
+        self.model = MyModel(["main", *args])
+        # system prompt
+        self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
+           "You will be able to see the image once I provide it to you. Please answer my questions."
+           "###")
+
+    def encode_img(self, image):
+        image = self.preprocessor(image)
+        image = image.unsqueeze(0)
+        device = image.device
+        if self.low_resource:
+            self.vit_to_cpu()
+            image = image.to("cpu")
+
+        with self.maybe_autocast():
+            image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
+            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
+
+            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+            query_output = self.Qformer.bert(
+                query_embeds=query_tokens,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                return_dict=True,
+            )
+
+            inputs_llama = self.llama_proj(query_output.last_hidden_state)
+            # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
+        return inputs_llama
+
+    def load_projection(self, path):
+        state = torch.load(path)["model"]
+        self.llama_proj.load_state_dict({
+            "weight": state["llama_proj.weight"],
+            "bias": state["llama_proj.bias"]})
+
+    def chat(self, question):
+        self.model.eval_string("Human: ")
+        self.model.eval_string(question)
+        self.model.eval_string("\n### Assistant:")
+        return self.model.generate_with_print(end="###")
+
+    def chat_with_image(self, image, question):
+        with torch.no_grad():
+            embd_image = self.encode_img(image)
+        embd_image = embd_image.cpu().numpy()[0]
+        self.model.eval_string("Human: <Img>")
+        self.model.eval_float(embd_image.T)
+        self.model.eval_string("</Img> ")
+        self.model.eval_string(question)
+        self.model.eval_string("\n### Assistant:")
+        return self.model.generate_with_print(end="###")
+
+
+if __name__=="__main__":
+    a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
+    a.load_projection(os.path.join(
+        os.path.dirname(__file__) ,
+        "pretrained_minigpt4.pth"))
+    respose = a.chat_with_image(
+        Image.open("./media/llama1-logo.png").convert('RGB'),
+        "what is the text in the picture?")
+    a.chat("what is the color of it?")
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+from embd_input import MyModel
+import numpy as np
+from torch import nn
+import torch
+
+# Location of the PandaGPT checkout (expected next to this script).
+panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
+# Directory passed to imagebind_huge() as its store_path.
+imagebind_ckpt_path = "./models/panda_gpt/"
+
+# Make PandaGPT's bundled ImageBind package importable; must run before the imports below.
+sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
+from ImageBind.models import imagebind_model
+from ImageBind import data
+
+ModalityType = imagebind_model.ModalityType
+# Stored on each PandaGPT instance as max_tgt_len.
+max_tgt_len = 400
+
+class PandaGPT:
+    def __init__(self, args):
+        self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
+        self.visual_encoder.eval()
+        self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
+        self.max_tgt_len = max_tgt_len
+        self.model = MyModel(["main", *args])
+        self.generated_text = ""
+        self.device = "cpu"
+
+    def load_projection(self, path):
+        state = torch.load(path, map_location="cpu")
+        self.llama_proj.load_state_dict({
+            "weight": state["llama_proj.weight"],
+            "bias": state["llama_proj.bias"]})
+
+    def eval_inputs(self, inputs):
+        self.model.eval_string("<Img>")
+        embds = self.extract_multimoal_feature(inputs)
+        for i in embds:
+            self.model.eval_float(i.T)
+        self.model.eval_string("</Img> ")
+
+    def chat(self, question):
+        return self.chat_with_image(None, question)
+
+    def chat_with_image(self, inputs, question):
+        if self.generated_text == "":
+            self.model.eval_string("###")
+        self.model.eval_string(" Human: ")
+        if inputs:
+            self.eval_inputs(inputs)
+        self.model.eval_string(question)
+        self.model.eval_string("\n### Assistant:")
+        ret = self.model.generate_with_print(end="###")
+        self.generated_text += ret
+        return ret
+
+    def extract_multimoal_feature(self, inputs):
+        features = []
+        for key in ["image", "audio", "video", "thermal"]:
+            if key + "_paths" in inputs:
+                embeds = self.encode_data(key, inputs[key+"_paths"])
+                features.append(embeds)
+        return features
+
+    def encode_data(self, data_type, data_paths):
+
+        type_map = {
+            "image": ModalityType.VISION,
+            "audio": ModalityType.AUDIO,
+            "video": ModalityType.VISION,
+            "thermal": ModalityType.THERMAL,
+        }
+        load_map = {
+            "image": data.load_and_transform_vision_data,
+            "audio": data.load_and_transform_audio_data,
+            "video": data.load_and_transform_video_data,
+            "thermal": data.load_and_transform_thermal_data
+        }
+
+        load_function = load_map[data_type]
+        key = type_map[data_type]
+
+        inputs = {key: load_function(data_paths, self.device)}
+        with torch.no_grad():
+            embeddings = self.visual_encoder(inputs)
+            embeds = embeddings[key]
+            embeds = self.llama_proj(embeds).cpu().numpy()
+        return embeds
+
+
+if __name__=="__main__":
+    a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
+    a.load_projection("./models/panda_gpt/adapter_model.bin")
+    a.chat_with_image(
+        {"image_paths": ["./media/llama1-logo.png"]},
+        "what is the text in the picture? 'llama' or 'lambda'?")
+    a.chat("what is the color of it?")
@@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
 endif()
@@ -39,9 +39,9 @@ static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-
 static bool is_interacting = false;

+
 static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
@@ -104,7 +104,7 @@ static void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
    gpt_params params;
-    llama_sampling_params & sparams = params.sparams;
+    llama_sampling_params & sparams = params.sampling_params;
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
@@ -358,10 +358,36 @@ int main(int argc, char ** argv) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

+    struct llama_grammar * grammar = NULL;
+    grammar_parser::parse_state parsed_grammar;
+
+    if (!params.grammar.empty()) {
+        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+        // will be empty (default) if there are parse errors
+        if (parsed_grammar.rules.empty()) {
+            return 1;
+        }
+        LOG_TEE("%s: grammar:\n", __func__);
+        grammar_parser::print_grammar(stderr, parsed_grammar);
+        LOG_TEE("\n");
+
+        {
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
+                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+            }
+        }
+
+        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+        grammar = llama_grammar_init(
+            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    }
+
    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.infill) {
        printf("\n************\n");
@@ -404,7 +430,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);

    while (n_remain != 0 || params.interactive) {
        // predict
@@ -523,7 +549,7 @@ int main(int argc, char ** argv) {

            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id);

            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

@@ -541,11 +567,8 @@ int main(int argc, char ** argv) {
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
-
-                // push the prompt in the sampling context in order to apply repetition penalties later
-                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
-
+                ctx_sampling->prev.erase(ctx_sampling->prev.begin());
+                ctx_sampling->prev.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
@@ -577,7 +600,7 @@ int main(int argc, char ** argv) {
        if ((int) embd_inp.size() <= n_consumed) {

            // deal with eot token in infill mode
-            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){
+            if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
                if(is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
@@ -594,7 +617,7 @@ int main(int argc, char ** argv) {
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
-                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
@@ -604,7 +627,7 @@ int main(int argc, char ** argv) {
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
-                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
@@ -617,7 +640,7 @@ int main(int argc, char ** argv) {
                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+                if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
@@ -644,7 +667,7 @@ int main(int argc, char ** argv) {
                is_interacting = false;
            }
            // deal with end of text token in interactive mode
-            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
+            else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -717,7 +740,15 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    llama_sampling_reset(ctx_sampling);
+                    // reset grammar state if we're restarting generation
+                    if (grammar != NULL) {
+                        llama_grammar_free(grammar);
+
+                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+                        grammar = llama_grammar_init(
+                            grammar_rules.data(), grammar_rules.size(),
+                            parsed_grammar.symbol_ids.at("root"));
+                    }
                }
                is_interacting = false;
            }
@@ -747,7 +778,9 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

-    llama_sampling_free(ctx_sampling);
+    if (grammar != NULL) {
+        llama_grammar_free(grammar);
+    }
    llama_backend_free();

 #ifndef LOG_DISABLE_LOGS
@@ -112,7 +112,8 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
 static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if (!cur) {
-        throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
+        printf("unable to find tensor %s\n", name.c_str());
+        throw std::runtime_error(format("unable to find tensor %s\n", name.c_str()));
    }

    return cur;
@@ -135,7 +136,7 @@ static std::string get_ftype(int ftype) {
    case 8:
        return "q8_0";
    default:
-        throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
+        throw std::runtime_error(format("Unrecognized file type: %d\n", ftype));
    }
 }

@@ -461,9 +462,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    };

    struct gguf_context * ctx = gguf_init_from_file(fname, params);
-    if (!ctx) {
-        throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
-    }

    if (verbosity >= 1) {
        const int n_tensors = gguf_get_n_tensors(ctx);
@@ -16,29 +16,13 @@ checkpoint = torch.load(path)
 mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]

 # store these tensors in a new dictionary and torch.save them
-projector = {name: checkpoint[name].float() for name in mm_tensors}
+projector = {name: checkpoint[name] for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")

 # remove these tensors from the checkpoint and save it again
 for name in mm_tensors:
    del checkpoint[name]

-# BakLLaVA models contain CLIP tensors in it
-clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
-if len(clip_tensors) > 0:
-    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
-    torch.save(clip, f"{args.model}/llava.clip")
-
-    # remove these tensors
-    for name in clip_tensors:
-        del checkpoint[name]
-
-    # added tokens should be removed to be able to convert Mistral models
-    if os.path.exists(f"{args.model}/added_tokens.json"):
-        with open(f"{args.model}/added_tokens.json", "w") as f:
-            f.write("{}\n")
-
-
 torch.save(checkpoint, path)

 print("Done!")
@@ -58,30 +58,28 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n

 // TODO: use common/sampling.h
 inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
-    auto & sparams = params.sparams;
-
-    // out of user input, sample next token
-    const float   temp      = sparams.temp;
-    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
-    const float   top_p     = sparams.top_p;
-    const float   tfs_z     = sparams.tfs_z;
-    const float   typical_p = sparams.typical_p;
-    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
-    // const float   repeat_penalty  = sparams.repeat_penalty;
-    // const float   alpha_presence  = sparams.presence_penalty;
-    // const float   alpha_frequency = sparams.frequency_penalty;
-    const int     mirostat     = sparams.mirostat;
-    const float   mirostat_tau = sparams.mirostat_tau;
-    const float   mirostat_eta = sparams.mirostat_eta;
-    // const bool    penalize_nl     = sparams.penalize_nl;
+      // out of user input, sample next token
+    const float   temp      = params.sampling_params.temp;
+    const int32_t top_k     = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
+    const float   top_p     = params.sampling_params.top_p;
+    const float   tfs_z     = params.sampling_params.tfs_z;
+    const float   typical_p = params.sampling_params.typical_p;
+      // const int32_t repeat_last_n   = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
+      // const float   repeat_penalty  = params.sampling_params.repeat_penalty;
+      // const float   alpha_presence  = params.sampling_params.presence_penalty;
+      // const float   alpha_frequency = params.sampling_params.frequency_penalty;
+    const int     mirostat     = params.sampling_params.mirostat;
+    const float   mirostat_tau = params.sampling_params.mirostat_tau;
+    const float   mirostat_eta = params.sampling_params.mirostat_eta;
+      // const bool    penalize_nl     = params.sampling_params.penalize_nl;

    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));

-        // Apply params.logit_bias map
-        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
+          // Apply params.logit_bias map
+        for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }

@@ -93,18 +91,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl(ctx)];
-        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        // llama_sample_repetition_penalty(ctx, &candidates_p,
-        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //      last_n_repeat, repeat_penalty);
-        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        // last_n_repeat, alpha_frequency, alpha_presence);
-        // if (!penalize_nl) {
-        //     logits[llama_token_nl(ctx)] = nl_logit;
-        // }
+          // TODO: Apply penalties
+          // float nl_logit = logits[llama_token_nl(ctx)];
+          // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+          // llama_sample_repetition_penalty(ctx, &candidates_p,
+          //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+          //      last_n_repeat, repeat_penalty);
+          // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+          // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+          // last_n_repeat, alpha_frequency, alpha_presence);
+          // if (!penalize_nl) {
+          //     logits[llama_token_nl(ctx)] = nl_logit;
+          // }

        if (temp <= 0) {
              // Greedy sampling
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
-    llama_sampling_params & sparams = params.sparams;
+    llama_sampling_params & sparams = params.sampling_params;

 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("main", "log"));
@@ -415,7 +415,8 @@ int main(int argc, char ** argv) {
            }
        }
    }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

@@ -458,7 +459,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@@ -611,7 +612,7 @@ int main(int argc, char ** argv) {

            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id);

            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

@@ -630,9 +631,12 @@ int main(int argc, char ** argv) {
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

-                // push the prompt in the sampling context in order to apply repetition penalties later
-                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+                // GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
+                //     Most likely will remove this in the future to avoid exposing "prev"
+                //     Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
+                //     penalty will be applied only based on the tokens generated by the model.
+                ctx_sampling->prev.erase(ctx_sampling->prev.begin());
+                ctx_sampling->prev.push_back(embd_inp[n_consumed]);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -663,10 +667,12 @@ int main(int argc, char ** argv) {

        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
-            // check for reverse prompt in the last n_prev tokens
+            // check for reverse prompt
            if (!params.antiprompt.empty()) {
-                const int n_prev = 32;
-                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+                std::string last_output;
+                for (auto id : ctx_sampling->prev) {
+                    last_output += llama_token_to_piece(ctx, id);
+                }

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
@@ -693,7 +699,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of text token in interactive mode
-            if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
+            if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.ctx_sampling = llama_sampling_init(params.sparams);
+        client.ctx_sampling = llama_sampling_init(params);
    }

    std::vector<llama_token> tokens_system;
@@ -330,7 +330,7 @@ int main(int argc, char ** argv) {

                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);

-                llama_sampling_accept(client.ctx_sampling, ctx, id, true);
+                llama_sampling_accept(client.ctx_sampling, ctx, id);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@@ -195,12 +195,10 @@ struct llama_server_context
    json prompt;
    std::vector<llama_token> embd;

-    gpt_params params;
-
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
-    llama_sampling_context *ctx_sampling = nullptr;
-
+    gpt_params params;
+    llama_sampling_context *ctx_sampling = nullptr; // must start null: rewind() frees it only when it is non-null
    int n_ctx;

    bool truncated = false;
@@ -234,7 +232,7 @@ struct llama_server_context
    void rewind()
    {
        params.antiprompt.clear();
-        params.sparams.grammar.clear();
+        params.grammar.clear();
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
@@ -248,14 +246,11 @@ struct llama_server_context
        multibyte_pending = 0;
        n_remain = 0;
        n_past = 0;
-        params.sparams.n_prev = n_ctx;
-    }

-    void initSampling() {
        if (ctx_sampling != nullptr) {
            llama_sampling_free(ctx_sampling);
        }
-        ctx_sampling = llama_sampling_init(params.sparams);
+        ctx_sampling = llama_sampling_init(params);
    }

    bool loadModel(const gpt_params &params_)
@@ -316,32 +311,16 @@ struct llama_server_context
        return prompt_tokens;
    }

-    void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
-        const int n_left = n_ctx - params.n_keep;
-        const int n_block_size = n_left / 2;
-        const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
-
-        // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
-        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
-
-        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
-
-        LOG_VERBOSE("input truncated", {
-                {"n_ctx", n_ctx},
-                {"n_keep", params.n_keep},
-                {"n_left", n_left},
-                {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
-                {"num_prompt_tokens", new_tokens.size()}
-        });
-
-        truncated = true;
-        prompt_tokens = new_tokens;
+    // (Re)creates the sampling context from the current `params`.
+    // Any sampling context that is already held is released first, so calling
+    // this once per request does not leak the previously created context.
+    // Returns true unconditionally; the bool result is kept for callers that
+    // check it.
+    bool loadGrammar()
+    {
+        if (ctx_sampling != nullptr) {
+            llama_sampling_free(ctx_sampling);
+        }
+        ctx_sampling = llama_sampling_init(params);
+        return true;
    }

    void loadInfill()
    {
        bool suff_rm_leading_spc = true;
-        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
            params.input_suffix.erase(0, 1);
            suff_rm_leading_spc = false;
        }
@@ -357,7 +336,6 @@ struct llama_server_context
        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
        prefix_tokens.push_back(llama_token_middle(ctx));
-
        auto prompt_tokens = prefix_tokens;

        num_prompt_tokens = prompt_tokens.size();
@@ -369,18 +347,31 @@ struct llama_server_context
        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t) n_ctx)
+        if (num_prompt_tokens >= (size_t)params.n_ctx)
        {
-            truncatePrompt(prompt_tokens);
-            num_prompt_tokens = prompt_tokens.size();
+            printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
+            // todo we probably want to cut from both sides
+            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin());

-            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+            LOG_VERBOSE("input truncated", {
+                                               {"n_ctx", params.n_ctx},
+                                               {"n_keep", params.n_keep},
+                                               {"n_left", n_left},
+                                               {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+                                           });
+
+            truncated = true;
+            prompt_tokens = new_tokens;
        }
-
-        // push the prompt into the sampling context (do not apply grammar)
-        for (auto & token : prompt_tokens)
+        else
        {
-            llama_sampling_accept(ctx_sampling, ctx, token, false);
+            const size_t ps = num_prompt_tokens;
+            std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps);
        }

        // compare the evaluated prompt with the new prompt
@@ -418,18 +409,29 @@ struct llama_server_context
        params.n_keep = std::min(n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t) n_ctx)
+        if (num_prompt_tokens >= (size_t)n_ctx)
        {
-            truncatePrompt(prompt_tokens);
-            num_prompt_tokens = prompt_tokens.size();
+            const int n_left = (n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin());

-            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+            LOG_VERBOSE("input truncated", {
+                                               {"n_ctx", n_ctx},
+                                               {"n_keep", params.n_keep},
+                                               {"n_left", n_left},
+                                               {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+                                           });
+
+            truncated = true;
+            prompt_tokens = new_tokens;
        }
-
-        // push the prompt into the sampling context (do not apply grammar)
-        for (auto & token : prompt_tokens)
+        else
        {
-            llama_sampling_accept(ctx_sampling, ctx, token, false);
+            const size_t ps = num_prompt_tokens;
+            std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps);
        }

        // compare the evaluated prompt with the new prompt
@@ -528,8 +530,8 @@ struct llama_server_context

            llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };

-            const int32_t n_probs = params.sparams.n_probs;
-            if (params.sparams.temp <= 0 && n_probs > 0)
+            const int32_t n_probs = params.sampling_params.n_probs;
+            if (params.sampling_params.temp <= 0 && n_probs > 0)
            {
                // For llama_sample_token_greedy we need to sort candidates
                llama_sample_softmax(ctx, &cur_p);
@@ -540,7 +542,7 @@ struct llama_server_context
                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
            }

-            llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+            llama_sampling_accept(ctx_sampling, ctx, result.tok);

            if (tg) {
                num_tokens_predicted++;
@@ -604,7 +606,7 @@ struct llama_server_context
        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;

-        if (params.sparams.n_probs > 0)
+        if (params.sampling_params.n_probs > 0)
        {
            generated_token_probs.push_back(token_with_probs);
        }
@@ -1002,36 +1004,36 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,

 static json format_generation_settings(llama_server_context &llama)
 {
    // Builds a JSON object describing the generation settings currently held by `llama`:
    // sampling values are read from the sampling-params struct, the remaining
    // fields (model alias, seed, stop words, n_predict, n_keep, grammar) from llama.params,
    // plus llama.n_ctx and llama.stream.
-    const auto & sparams = llama.params.sparams;
+    const auto & sparams = llama.params.sampling_params;
    // Look up the logit bias registered for the EOS token, if any.
    const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
    // "ignore_eos" is reported only when that bias exists and is negative infinity.
    const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);

    return json{
-        {"n_ctx",             llama.n_ctx},
-        {"model",             llama.params.model_alias},
-        {"seed",              llama.params.seed},
-        {"temp",              sparams.temp},
-        {"top_k",             sparams.top_k},
-        {"top_p",             sparams.top_p},
-        {"tfs_z",             sparams.tfs_z},
-        {"typical_p",         sparams.typical_p},
-        {"repeat_last_n",     sparams.penalty_last_n},
-        {"repeat_penalty",    sparams.penalty_repeat},
-        {"frequency_penalty", sparams.penalty_freq},
-        {"presence_penalty",  sparams.penalty_present},
-        {"mirostat",          sparams.mirostat},
-        {"mirostat_tau",      sparams.mirostat_tau},
-        {"mirostat_eta",      sparams.mirostat_eta},
-        {"penalize_nl",       sparams.penalize_nl},
-        {"stop",              llama.params.antiprompt},
-        {"n_predict",         llama.params.n_predict},
-        {"n_keep",            llama.params.n_keep},
-        {"ignore_eos",        ignore_eos},
-        {"stream",            llama.stream},
-        {"logit_bias",        sparams.logit_bias},
-        {"n_probs",           sparams.n_probs},
-        {"grammar",           llama.params.sparams.grammar},
+        {"n_ctx", llama.n_ctx},
+        {"model", llama.params.model_alias},
+        {"seed", llama.params.seed},
+        {"temp", sparams.temp},
+        {"top_k", sparams.top_k},
+        {"top_p", sparams.top_p},
+        {"tfs_z", sparams.tfs_z},
+        {"typical_p", sparams.typical_p},
+        {"repeat_last_n", sparams.repeat_last_n},
+        {"repeat_penalty", sparams.repeat_penalty},
+        {"presence_penalty", sparams.presence_penalty},
+        {"frequency_penalty", sparams.frequency_penalty},
+        {"mirostat", sparams.mirostat},
+        {"mirostat_tau", sparams.mirostat_tau},
+        {"mirostat_eta", sparams.mirostat_eta},
+        {"penalize_nl", sparams.penalize_nl},
+        {"stop", llama.params.antiprompt},
+        {"n_predict", llama.params.n_predict},
+        {"n_keep", llama.params.n_keep},
+        {"ignore_eos", ignore_eos},
+        {"stream", llama.stream},
+        {"logit_bias", sparams.logit_bias},
+        {"n_probs", sparams.n_probs},
+        {"grammar", llama.params.grammar},
    };
 }

@@ -1079,7 +1081,7 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"timings", format_timings(llama)},
    };

-    if (llama.params.sparams.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
    {
        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
    }
@@ -1095,7 +1097,7 @@ static json format_partial_response(
        {"stop", false},
    };

-    if (llama.params.sparams.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
    {
        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
    }
@@ -1127,30 +1129,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
    gpt_params default_params;
-    const auto & default_sparams = default_params.sparams;
+    const auto & default_sparams = default_params.sampling_params;
+    auto & sparams = llama.params.sampling_params;

-    auto & params  = llama.params;
-    auto & sparams = llama.params.sparams;
-
-    llama.stream            = json_value(body, "stream",            false);
-    params.n_predict        = json_value(body, "n_predict",         default_params.n_predict);
-    sparams.top_k           = json_value(body, "top_k",             default_sparams.top_k);
-    sparams.top_p           = json_value(body, "top_p",             default_sparams.top_p);
-    sparams.tfs_z           = json_value(body, "tfs_z",             default_sparams.tfs_z);
-    sparams.typical_p       = json_value(body, "typical_p",         default_sparams.typical_p);
-    sparams.temp            = json_value(body, "temperature",       default_sparams.temp);
-    sparams.penalty_last_n  = json_value(body, "repeat_last_n",     default_sparams.penalty_last_n);
-    sparams.penalty_repeat  = json_value(body, "repeat_penalty",    default_sparams.penalty_repeat);
-    sparams.penalty_freq    = json_value(body, "frequency_penalty", default_sparams.penalty_freq);
-    sparams.penalty_present = json_value(body, "presence_penalty",  default_sparams.penalty_present);
-    sparams.mirostat        = json_value(body, "mirostat",          default_sparams.mirostat);
-    sparams.mirostat_tau    = json_value(body, "mirostat_tau",      default_sparams.mirostat_tau);
-    sparams.mirostat_eta    = json_value(body, "mirostat_eta",      default_sparams.mirostat_eta);
-    sparams.penalize_nl     = json_value(body, "penalize_nl",       default_sparams.penalize_nl);
-    params.n_keep           = json_value(body, "n_keep",            default_params.n_keep);
-    params.seed             = json_value(body, "seed",              default_params.seed);
-    sparams.grammar         = json_value(body, "grammar",           default_sparams.grammar);
-    sparams.n_probs         = json_value(body, "n_probs",           default_sparams.n_probs);
+    llama.stream = json_value(body, "stream", false);
+    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
+    sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
+    sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
+    sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
+    sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
+    sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
+    sparams.temp = json_value(body, "temperature", default_sparams.temp);
+    sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
+    sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
+    sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
+    sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
+    sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
+    llama.params.seed = json_value(body, "seed", default_params.seed);
+    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
+    sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);

    if (body.count("prompt") != 0)
    {
@@ -1204,6 +1204,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
        }
    }

+    llama.ctx_sampling = llama_sampling_init(llama.params);
+
    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
 }

@@ -1372,9 +1374,15 @@ int main(int argc, char **argv)
        llama.rewind();

        llama_reset_timings(llama.ctx);
+
        parse_options_completion(json::parse(req.body), llama);

-        llama.initSampling();
+        if (!llama.loadGrammar())
+        {
+            res.status = 400;
+            return;
+        }
+
        llama.loadPrompt();
        llama.beginCompletion();

@@ -1406,7 +1414,7 @@ int main(int argc, char **argv)
            }

            auto probs = llama.generated_token_probs;
-            if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
+            if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
                const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
                probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
            }
@@ -1458,7 +1466,7 @@ int main(int argc, char **argv)

                        std::vector<completion_token_output> probs_output = {};

-                        if (llama.params.sparams.n_probs > 0) {
+                        if (llama.params.sampling_params.n_probs > 0) {
                            const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                            size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                            size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1529,9 +1537,14 @@ int main(int argc, char **argv)
        llama.rewind();

        llama_reset_timings(llama.ctx);
+
        parse_options_infill(json::parse(req.body), llama);

-        llama.initSampling();
+        if (!llama.loadGrammar())
+        {
+            res.status = 400;
+            return;
+        }
        llama.loadInfill();
        llama.beginCompletion();
        const auto chunked_content_provider = [&](size_t, DataSink & sink) {
@@ -1574,7 +1587,7 @@ int main(int argc, char **argv)

                    std::vector<completion_token_output> probs_output = {};

-                    if (llama.params.sparams.n_probs > 0) {
+                    if (llama.params.sampling_params.n_probs > 0) {
                        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1681,9 +1694,7 @@ int main(int argc, char **argv)
        const json body = json::parse(req.body);

        llama.rewind();
-
        llama_reset_timings(llama.ctx);
-
        if (body.count("content") != 0)
        {
            llama.prompt = body["content"];
@@ -1693,8 +1704,6 @@ int main(int argc, char **argv)
            llama.prompt = "";
        }
        llama.params.n_predict = 0;
-
-        llama.initSampling();
        llama.loadPrompt();
        llama.beginCompletion();
        llama.doCompletion();
@@ -37,8 +37,8 @@ int main(int argc, char ** argv) {
    const int n_seq_dft = params.n_parallel;

    // TODO: make this configurable
-    const float p_accept = 0.80f;
-    const float p_split  = 0.10f;
+    const float p_accept = 0.4f;
+    const float p_split  = 0.3f;

 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
@@ -112,16 +112,16 @@ int main(int argc, char ** argv) {
    bool has_eos = false;

    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);

-    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    params.sparams.temp = std::max(0.01f, params.sparams.temp);
+    params.grammar.clear();             // the draft samplers will copy the target sampler's grammar
+    params.sampling_params.temp = 1.0f; // the draft samplers use default temperature

    for (int s = 0; s < n_seq_dft; ++s) {
-        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
+        drafts[s].ctx_sampling = llama_sampling_init(params);
    }

    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -154,9 +154,9 @@ int main(int argc, char ** argv) {
            // sample from the target model
            llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);

-            llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
+            llama_sampling_accept(ctx_sampling, ctx_tgt, id);

-            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
+            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));

            const std::string token_str = llama_token_to_piece(ctx_tgt, id);

@@ -202,7 +202,7 @@ int main(int argc, char ** argv) {

            // TODO: simplify
            {
-                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+                LOG("keeping sequence %d\n", s_keep);

                llama_kv_cache_seq_keep(ctx_dft, s_keep);
                llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
@@ -277,7 +277,7 @@ int main(int argc, char ** argv) {
                }

                if (cur_p[0].p < p_accept) {
-                    LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
+                    LOG("stopping drafting for seq %3d, probability too low: %.3f < 2*%.3f\n", s, cur_p[0].p, cur_p[1].p);
                    drafts[s].drafting = false;
                    continue;
                }
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {

                    const int s = sa[is];

-                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
+                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id);

                    drafts[s].tokens.push_back(id);

@@ -337,14 +337,16 @@ int main(int argc, char ** argv) {

                    llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);

+                    // no need to evaluate the last drafted token, since we won't use the result
+                    if (batch_tgt.n_tokens > n_draft) {
+                        drafts[s].drafting = false;
+                        continue;
+                    }
+
                    // add the token to the batch for batched decoding with the draft model
                    drafts[s].i_batch_dft = batch_dft.n_tokens;

                    llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
-
-                    if (batch_tgt.n_tokens > n_draft) {
-                        drafts[s].drafting = false;
-                    }
                }
            }

@@ -363,6 +365,11 @@ int main(int argc, char ** argv) {
            }
        }

+        // account for the last drafted token that we didn't evaluate
+        if (batch_tgt.n_tokens > n_draft) {
+            ++n_drafted;
+        }
+
        // evaluate the target model on the drafted tokens
        {
            llama_kv_cache_seq_keep(ctx_tgt, 0);
@@ -73,8 +73,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_f16);
    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_0);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_1);
    GGML_METAL_DECL_KERNEL(get_rows_q8_0);
    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
@@ -89,8 +87,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
    GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
@@ -101,8 +97,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
@@ -260,8 +254,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_f16);
        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_0);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_1);
        GGML_METAL_ADD_KERNEL(get_rows_q8_0);
        GGML_METAL_ADD_KERNEL(get_rows_q2_K);
        GGML_METAL_ADD_KERNEL(get_rows_q3_K);
@@ -276,8 +268,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
        GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
@@ -288,10 +278,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
@@ -358,8 +346,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_f16);
    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DEL_KERNEL(get_rows_q5_0);
-    GGML_METAL_DEL_KERNEL(get_rows_q5_1);
    GGML_METAL_DEL_KERNEL(get_rows_q8_0);
    GGML_METAL_DEL_KERNEL(get_rows_q2_K);
    GGML_METAL_DEL_KERNEL(get_rows_q3_K);
@@ -374,8 +360,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
    GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
@@ -386,10 +370,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
@@ -1070,8 +1052,6 @@ void ggml_metal_graph_compute(
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
                                    case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                                    case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
-                                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break;
-                                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break;
                                    case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
                                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
                                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
@@ -1141,24 +1121,6 @@ void ggml_metal_graph_compute(
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            GGML_ASSERT(ne02 == 1);
-                                            GGML_ASSERT(ne12 == 1);
-
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32];
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            GGML_ASSERT(ne02 == 1);
-                                            GGML_ASSERT(ne12 == 1);
-
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32];
-                                        } break;
                                    case GGML_TYPE_Q8_0:
                                        {
                                            GGML_ASSERT(ne02 == 1);
@@ -1239,8 +1201,7 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
                                [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];

-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
-                                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
@@ -1272,8 +1233,6 @@ void ggml_metal_graph_compute(
                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-                                case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break;
-                                case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break;
                                case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
@@ -18,21 +18,6 @@ typedef struct {
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;

-#define QK5_0 32
-typedef struct {
-    half d;                // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-#define QK5_1 32
-typedef struct {
-    half d;                 // delta
-    half m;                 // min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-
 #define QK8_0 32
 typedef struct {
    half    d;         // delta
@@ -414,11 +399,8 @@ kernel void kernel_rms_norm(
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
-
    float2 acc = 0.f;
-
    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
-
    for (int i = 0; i < 8; i+=2) {
        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
                + yl[i + 1] * (qs[i / 2] & 0x0F00);
@@ -435,11 +417,8 @@ inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thre
 inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
    float m = qb_curr->m;
-
-    float2 acc = 0.f;
-
    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
-
+    float2 acc = 0.f;
    for (int i = 0; i < 8; i+=2) {
        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
                + yl[i + 1] * (qs[i / 2] & 0x0F00);
@@ -449,49 +428,6 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
    return d * (acc[0] + acc[1]) + sumy * m;
 }

-// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q5 quants begin (0 or QK5_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 3 + il/2);
-           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
-    }
-    return d * (sumy * -16.f + acc[0] + acc[1]);
-}
-
-// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q5 quants begin (0 or QK5_1/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-    float m = qb_curr->m;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
-           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
-    }
-    return d * (acc[0] + acc[1]) + sumy * m;
-}
-
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4        // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2  // number of SIMD groups in a thread group
@@ -589,43 +525,6 @@ kernel void kernel_mul_mv_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }

-kernel void kernel_mul_mv_q5_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01[[buffer(4)]],
-        constant   int64_t & ne02[[buffer(5)]],
-        constant   int64_t & ne10[[buffer(9)]],
-        constant   int64_t & ne12[[buffer(11)]],
-        constant   int64_t & ne0[[buffer(15)]],
-        constant   int64_t & ne1[[buffer(16)]],
-        constant   uint    & gqa[[buffer(17)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q5_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01[[buffer(4)]],
-        constant   int64_t & ne02[[buffer(5)]],
-        constant   int64_t & ne10[[buffer(9)]],
-        constant   int64_t & ne12[[buffer(11)]],
-        constant   int64_t & ne0[[buffer(15)]],
-        constant   int64_t & ne1[[buffer(16)]],
-        constant   uint    & gqa[[buffer(17)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
-}
-
-
 #define NB_Q8_0 8

 kernel void kernel_mul_mv_q8_0_f32(
@@ -2250,62 +2149,6 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
    }
 }

-template <typename type4x4>
-void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
-    const float d = xb->d;
-    const float md = -16.h * xb->d;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + md;
-        reg[i/2][2*(i%2)+1] = d * x1 + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
-    const float d = xb->d;
-    const float m = xb->m;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + m;
-        reg[i/2][2*(i%2)+1] = d * x1 + m;
-    }
-}
-
 template <typename type4x4>
 void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
    device const int8_t * qs = ((device const int8_t *)xb->qs);
@@ -2647,8 +2490,6 @@ template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
 template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
 template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
-template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
-template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
 template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
@@ -2677,8 +2518,6 @@ template [[host_name("kernel_mul_mm_f32_f32")]]  kernel mat_mm_t kernel_mul_mm<f
 template [[host_name("kernel_mul_mm_f16_f32")]]  kernel mat_mm_t kernel_mul_mm<half4x4,    1,     dequantize_f16>;
 template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2,     dequantize_q4_0>;
 template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_0, 2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_1, 2,     dequantize_q5_1>;
 template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2,     dequantize_q8_0>;
 template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
@@ -1395,46 +1395,75 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    size_t x_size;
    size_t d_size;

-    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst


    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
            cl_event ev;

            // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));

-            const int64_t i13 = i03%ne13;
-            const int64_t i12 = i02%ne12;
-            const int i1 = i13*ne12*ne11 + i12*ne11;
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;

-            cl_int x_offset = 0;
-            cl_int y_offset = i1*ne10;
-            cl_int d_offset = 0;
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;

-            size_t global = ne00 * ne01;
-            cl_int ky = ne10 * ne11;
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;

-            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }

            CL_CHECK(clReleaseEvent(ev));
            CL_CHECK(clFinish(queue));
@@ -1489,45 +1518,46 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;

-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
-                    x_offset = (i03 * ne02 + i02) * x_ne;
-                } else {
-                    // copy src0 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                }
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;

-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    // copy src1 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;

-                    CL_CHECK(clFinish(queue));
-
-                    // compute
-                    cl_event ev_sgemm;
-                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                               ne01, ne11, ne10,
-                                                               alpha,
-                                                               d_X, x_offset, ne00,
-                                                               d_Y, 0, ne10,
-                                                               beta,
-                                                               d_D, 0, ne01,
-                                                               &queue, &ev_sgemm);
-
-                    if (status != clblast::StatusCode::kSuccess) {
-                        GGML_ASSERT(false);
-                    }
-
-                    // copy dst to host
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-                }
+            // copy data to device
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
            }
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+            CL_CHECK(clFinish(queue));
+
+            // compute
+            cl_event ev_sgemm;
+            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                       ne01, ne11, ne10,
+                                                       alpha,
+                                                       d_X, x_offset, ne00,
+                                                       d_Y, 0, ne10,
+                                                       beta,
+                                                       d_D, 0, ne01,
+                                                       &queue, &ev_sgemm);
+
+            if (status != clblast::StatusCode::kSuccess) {
+                GGML_ASSERT(false);
+            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
        }
    }

@@ -1588,70 +1618,73 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;

-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
-                    x_offset = (i03 * ne02 + i02) * x_ne;
-                } else {
-                    // copy src0 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
+            // copy src0 to device
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
+            }
+
+            // convert src1 to fp16
+            // TODO: use multiple threads
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+            if (src1_cont_rows) {
+                if (src1_cont_cols) {
+                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                }
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    // convert src1 to fp16
-                    // TODO: use multiple threads
-                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-                    if (src1_cont_rows) {
-                        if (src1_cont_cols) {
-                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
-                        }
-                        else {
-                            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
-                            }
-                        }
+                else {
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                    }
-                    else {
-                        for (int64_t i11 = 0; i11 < ne11; i11++) {
-                            for (int64_t i10 = 0; i10 < ne10; i10++) {
-                                // very slow due to no inlining
-                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
-                            }
-                        }
-                    }
-
-                    // copy src1 to device
-                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-                    CL_CHECK(clFinish(queue));
-
-                    // compute
-                    cl_event ev_sgemm;
-                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                               ne01, ne11, ne10,
-                                                               alpha,
-                                                               d_X, x_offset, ne00,
-                                                               d_Y, 0, ne10,
-                                                               beta,
-                                                               d_D, 0, ne01,
-                                                               &queue, &ev_sgemm);
-
-                    if (status != clblast::StatusCode::kSuccess) {
-                        GGML_ASSERT(false);
-                    }
-
-                    // copy dst to host, then convert to float
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
                }
            }
+            else {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
+                        // very slow due to no inlining
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    }
+                }
+            }
+
+            // copy src1 to device
+            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+            CL_CHECK(clFinish(queue));
+
+            // compute
+            cl_event ev_sgemm;
+            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                       ne01, ne11, ne10,
+                                                       alpha,
+                                                       d_X, x_offset, ne00,
+                                                       d_Y, 0, ne10,
+                                                       beta,
+                                                       d_D, 0, ne01,
+                                                       &queue, &ev_sgemm);
+
+            if (status != clblast::StatusCode::kSuccess) {
+                GGML_ASSERT(false);
+            }
+
+            // copy dst to host, then convert to float
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+
+            ggml_fp16_to_fp32_row(tmp, d, d_ne);
        }
    }

@@ -1714,81 +1747,85 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    size_t ev_idx = 0;
    std::vector<cl_event> events;

-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy and dequantize src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                // copy src0 to device if necessary
-                if (src0->backend == GGML_BACKEND_CPU) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                if (i02 != pi02 || i03 != pi03) {
                    events.emplace_back();
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                } else if (src0->backend == GGML_BACKEND_GPU) {
-                    d_Q = (cl_mem) src0->extra;
-                } else {
+                    pi02 = i02;
+                    pi03 = i03;
+                }
+            } else if (src0->backend == GGML_BACKEND_GPU) {
+                d_Q = (cl_mem) src0->extra;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                // copy src1 to device
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                // compute
+                const size_t global = ne01 * local;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                const cl_int ncols = ne00;
+                events.emplace_back();
+                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
+                // convert src0 to fp32 on device
+                const size_t global = x_ne / global_denom;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+
+                // copy src1 to device
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                events.emplace_back();
+
+                // wait for conversion
+                CL_CHECK(clFinish(queue));
+
+                // compute
+                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                           clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                           ne01, ne11, ne10,
+                                                           alpha,
+                                                           d_X, 0, ne00,
+                                                           d_Y, 0, ne10,
+                                                           beta,
+                                                           d_D, 0, ne01,
+                                                           &queue, events.data() + ev_idx++);
+
+                if (status != clblast::StatusCode::kSuccess) {
                    GGML_ASSERT(false);
                }
-
-                if (!mul_mat_vec) {
-                    // convert src0 to fp32 on device
-                    const size_t global = x_ne / global_denom;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-                }
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                        // copy src1 to device
-                        events.emplace_back();
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                        // compute
-                        const size_t global = ne01 * local;
-                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                        const cl_int ncols = ne00;
-                        events.emplace_back();
-                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-                    } else { // CLBlast matrix matrix multiplication
-                        // copy src1 to device
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                        // wait for conversion
-                        CL_CHECK(clFinish(queue));
-
-                        // compute
-                        events.emplace_back();
-                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                   ne01, ne11, ne10,
-                                                                   alpha,
-                                                                   d_X, 0, ne00,
-                                                                   d_Y, 0, ne10,
-                                                                   beta,
-                                                                   d_D, 0, ne01,
-                                                                   &queue, events.data() + ev_idx++);
-
-                        if (status != clblast::StatusCode::kSuccess) {
-                            GGML_ASSERT(false);
-                        }
-                    }
-
-                    // copy dst to host
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-                    for (auto *event : events) {
-                        clReleaseEvent(event);
-                    }
-
-                    ev_idx = 0;
-                    events.clear();
-                }
            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
        }
    }

@@ -13537,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
                        dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                    }
-                } else if (!is_neox) {
+                } if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta);
                        const float sin_theta = sinf(theta);
@@ -16602,10 +16602,6 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    GGML_ASSERT(params);

-    if (tensor->op == GGML_OP_NONE) {
-        return;
-    }
-
 #ifdef GGML_USE_CUBLAS
    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
    if (skip_cpu) {
@@ -19174,7 +19170,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

                            if (idx == -1) {
                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
-                                fclose(fout);
                                return;
                            }

@@ -20849,7 +20844,7 @@ struct gguf_kv {
 };

 struct gguf_header {
-    char magic[4];
+    uint32_t magic;
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
@@ -20919,7 +20914,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

-    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
+    ctx->header.magic     = GGUF_MAGIC;
    ctx->header.version   = GGUF_VERSION;
    ctx->header.n_tensors = 0;
    ctx->header.n_kv      = 0;
@@ -20945,18 +20940,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // offset from start of file
    size_t offset = 0;

-    char magic[4];
+    uint32_t magic = 0;

    // check the magic before making allocations
    {
        gguf_fread_el(file, &magic, sizeof(magic), &offset);

-        for (uint32_t i = 0; i < sizeof(magic); i++) {
-            if (magic[i] != GGUF_MAGIC[i]) {
-                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
-                fclose(file);
-                return NULL;
-            }
+        if (magic != GGUF_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            fclose(file);
+            return NULL;
        }
    }

@@ -20966,8 +20959,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

    // read the header
    {
-        strncpy(ctx->header.magic, magic, 4);
-
+        ctx->header.magic = magic;

        ctx->kv    = NULL;
        ctx->infos = NULL;
@@ -231,9 +231,8 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 2

 #define GGUF_DEFAULT_ALIGNMENT 32

@@ -19,10 +19,9 @@ import numpy as np
 #

 GGUF_MAGIC             = 0x46554747
-GGUF_VERSION           = 3
+GGUF_VERSION           = 2
 GGUF_DEFAULT_ALIGNMENT = 32

-
 # general
 KEY_GENERAL_ARCHITECTURE         = "general.architecture"
 KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -598,10 +597,6 @@ class GGMLQuantizationType(IntEnum):
    Q6_K = 14
    Q8_K = 15

-class GGUFEndian(IntEnum):
-    LITTLE = 0
-    BIG = 1
-

 class GGUFValueType(IntEnum):
    UINT8   = 0
@@ -649,41 +644,18 @@ class GGUFWriter:
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
    tensors: list[tuple[np.ndarray[Any, Any], int]]

-    @property
-    def pack_prefix(self):
-        if self.endianess==GGUFEndian.LITTLE:
-            return "<"
-        else:
-            return ">"
-
-    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
+    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
        self.fout = open(path, "wb")
        self.arch = arch
-        self.endianess = endianess
-        self._simple_value_packing = {
-            GGUFValueType.UINT8:   f"{self.pack_prefix}B",
-            GGUFValueType.INT8:    f"{self.pack_prefix}b",
-            GGUFValueType.UINT16:  f"{self.pack_prefix}H",
-            GGUFValueType.INT16:   f"{self.pack_prefix}h",
-            GGUFValueType.UINT32:  f"{self.pack_prefix}I",
-            GGUFValueType.INT32:   f"{self.pack_prefix}i",
-            GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
-            GGUFValueType.UINT64:  f"{self.pack_prefix}Q",
-            GGUFValueType.INT64:   f"{self.pack_prefix}q",
-            GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
-            GGUFValueType.BOOL:    "?" ,
-        }
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []
-        endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
-        print(f"This gguf file is for {endianess_str} only")

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
-        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
-        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack("<Q", self.ti_data_count))
+        self.fout.write(struct.pack("<Q", self.kv_data_count))
        self.flush()
 #        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

@@ -755,12 +727,25 @@ class GGUFWriter:
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)

+    _simple_value_packing = {
+        GGUFValueType.UINT8:   "<B",
+        GGUFValueType.INT8:    "<b",
+        GGUFValueType.UINT16:  "<H",
+        GGUFValueType.INT16:   "<h",
+        GGUFValueType.UINT32:  "<I",
+        GGUFValueType.INT32:   "<i",
+        GGUFValueType.FLOAT32: "<f",
+        GGUFValueType.UINT64:  "<Q",
+        GGUFValueType.INT64:   "<q",
+        GGUFValueType.FLOAT64: "<d",
+        GGUFValueType.BOOL:    "?" ,
+    }
    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
-            self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
+            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

        pack_fmt = self._simple_value_packing.get(vtype)
@@ -768,14 +753,14 @@ class GGUFWriter:
            self.kv_data += struct.pack(pack_fmt, val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
+            self.kv_data += struct.pack("<Q", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
            ltype = GGUFValueType.get_type(val[0])
            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
-            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
+            self.kv_data += struct.pack("<I", ltype)
+            self.kv_data += struct.pack("<Q", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
@@ -789,24 +774,22 @@ class GGUFWriter:
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
+        self.ti_data += struct.pack("<Q", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
-        self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
+        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
-            self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
-        self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
-        self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
+        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1

    def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
-        if self.endianess == GGUFEndian.BIG:
-            tensor.byteswap(inplace=True)
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            fp.seek(0)
@@ -832,8 +815,6 @@ class GGUFWriter:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
-        if self.endianess==GGUFEndian.BIG:
-            tensor.byteswap(inplace=True)
        self.write_padding(self.fout, self.fout.tell())
        tensor.tofile(self.fout)
        self.write_padding(self.fout, tensor.nbytes)
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.5"
+version = "0.4.4"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv) && !defined(__s390__)
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
@@ -1018,8 +1018,8 @@ enum e_model {
 };

 static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;

 struct llama_hparams {
    bool     vocab_only;
@@ -1042,21 +1042,21 @@ struct llama_hparams {
    float f_max_alibi_bias;

    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only  != other.vocab_only)  return true;
-        if (this->n_vocab     != other.n_vocab)     return true;
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd      != other.n_embd)      return true;
-        if (this->n_head      != other.n_head)      return true;
-        if (this->n_head_kv   != other.n_head_kv)   return true;
-        if (this->n_layer     != other.n_layer)     return true;
-        if (this->n_rot       != other.n_rot)       return true;
-        if (this->n_ff        != other.n_ff)        return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;

        const float EPSILON = 1e-9;

-        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

        return false;
@@ -1195,11 +1195,11 @@ struct llama_vocab {
    id special_sep_id = -1;
    id special_pad_id = -1;

-    id linefeed_id       = 13;
+    id linefeed_id = 13;
    id special_prefix_id = 32007;
    id special_middle_id = 32009;
    id special_suffix_id = 32008;
-    id special_eot_id    = 32010;
+    id special_eot_id = 32010;

    int find_bpe_rank(std::string token_left, std::string token_right) const {
        replace_all(token_left,  " ",  "\u0120");
@@ -1359,7 +1359,10 @@ static bool llama_kv_cache_init(
    cache.cells.clear();
    cache.cells.resize(n_ctx);

-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    // TODO: this should be:
+    //       cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    //       change it and test that it works
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
    memset(cache.buf.data, 0, cache.buf.size);

    struct ggml_init_params params;
@@ -5815,33 +5818,6 @@ static struct ggml_cgraph * llama_build_graph(
            GGML_ASSERT(false);
    }

-#if 1
-    for (int i = 0; i < result->n_nodes; ++i) {
-        struct ggml_tensor * node = result->nodes[i];
-        if (getenv("SKIP_KQ_ALL")) {
-            if (
-                    strcmp(node->name, "KQ")  == 0 ||
-                    strcmp(node->name, "KQ_scaled") == 0 ||
-                    strcmp(node->name, "KQ_masked") == 0 ||
-                    strcmp(node->name, "KQ_soft_max") == 0 ||
-                    strcmp(node->name, "KQV") == 0 ||
-                    false) {
-                //printf("skipping %s\n", dst->name);
-                node->op  = GGML_OP_NONE;
-            }
-        }
-        if (getenv("SKIP_KQ_KQV")) {
-            if (
-                    strcmp(node->name, "KQ")  == 0 ||
-                    strcmp(node->name, "KQV") == 0 ||
-                    false) {
-                //printf("skipping %s\n", dst->name);
-                node->op  = GGML_OP_NONE;
-            }
-        }
-    }
-#endif
-
    return result;
 }

@@ -6348,6 +6324,7 @@ struct llm_tokenizer_bpe {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
                sym.text = word.c_str() + offset;
+                sym.n = 1;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
@@ -7077,7 +7054,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    std::vector<llama_grammar_candidate> rejects;

    if (stack.empty()) {
-        for (const auto & tok : candidates) {
+        for (auto tok : candidates) {
            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                rejects.push_back(tok);
            }
@@ -7088,7 +7065,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    const llama_grammar_element * stack_pos = stack.back();

    std::vector<llama_grammar_candidate> next_candidates;
-    for (const auto & tok : candidates) {
+    for (auto tok : candidates) {
        if (*tok.code_points == 0) {
            // reached end of full codepoints in token, reject iff it ended in a partial sequence
            // that cannot satisfy this position in grammar
@@ -7114,7 +7091,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    llama_grammar_advance_stack(rules, stack_after, next_stacks);

    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (const auto & tok : next_rejects) {
+    for (auto tok : next_rejects) {
        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
    }

@@ -7441,15 +7418,37 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
    llama_sample_temp(ctx, candidates_p, temp);
 }

-void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present) {
-    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+    if (last_tokens_size == 0 || penalty == 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        if (token_iter == last_tokens + last_tokens_size) {
+            continue;
+        }
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty;
+        } else {
+            candidates->data[i].logit /= penalty;
+        }
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
        return;
    }

@@ -7457,28 +7456,19 @@ void llama_sample_repetition_penalties(

    // Create a frequency map to count occurrences of each token in last_tokens
    std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i < penalty_last_n; ++i) {
-        token_count[last_tokens[i]]++;
+    for (size_t i = 0; i < last_tokens_size; ++i) {
+        token_count[last_tokens_p[i]]++;
    }

    // Apply frequency and presence penalties to the candidates
    for (size_t i = 0; i < candidates->size; ++i) {
-        const auto token_iter = token_count.find(candidates->data[i].id);
+        auto token_iter = token_count.find(candidates->data[i].id);
        if (token_iter == token_count.end()) {
            continue;
        }

-        const int count = token_iter->second;
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty_repeat;
-        } else {
-            candidates->data[i].logit /= penalty_repeat;
-        }
-
-        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
+        int count = token_iter->second;
+        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
    }

    candidates->sorted = false;
@@ -560,15 +560,21 @@ extern "C" {
    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_repetition_penalties(
+    LLAMA_API void llama_sample_repetition_penalty(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
-                          size_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present);
+                          size_t   last_tokens_size,
+                          float    penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+               const llama_token * last_tokens,
+                          size_t   last_tokens_size,
+                           float   alpha_frequency,
+                           float   alpha_presence);

    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -4,9 +4,7 @@

 #undef NDEBUG
 #include <cassert>
-#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
-#endif
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@@ -8,9 +8,11 @@
 #include <cmath>
 #include <numeric>
 #include <cassert>
+#include <iostream>
 #include <vector>
 #include <algorithm>

+
 static void dump(const llama_token_data_array * candidates) {
    for (size_t i = 0; i < candidates->size; i++) {
        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
@@ -19,6 +21,7 @@ static void dump(const llama_token_data_array * candidates) {

 #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)

+
 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@@ -34,12 +37,13 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
    llama_sample_top_k(nullptr, &candidates_p, k, 1);
    DUMP(&candidates_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
+    assert(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
    }
 }

+
 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@@ -55,12 +59,13 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
    llama_sample_top_p(nullptr, &candidates_p, p, 1);
    DUMP(&candidates_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
+    assert(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }

+
 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@@ -75,12 +80,13 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
    DUMP(&candidates_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
+    assert(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }

+
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@@ -95,17 +101,18 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
    llama_sample_typical(nullptr, &candidates_p, p, 1);
    DUMP(&candidates_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
+    assert(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }

-static void test_repetition_penalties(
+
+static void test_repetition_penalty(
    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
+    const std::vector<float> & expected_probs, float penalty
 ) {
-    GGML_ASSERT(probs.size() == expected_probs.size());
+    assert(probs.size() == expected_probs.size());

    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@@ -118,13 +125,41 @@ static void test_repetition_penalties(
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    llama_sample_softmax(nullptr, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
+    llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
    llama_sample_softmax(nullptr, &candidates_p);
    DUMP(&candidates_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
+    assert(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6);
+    }
+}
+
+
+static void test_frequency_presence_penalty(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
+) {
+    assert(probs.size() == expected_probs.size());
+
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sample_softmax(nullptr, &candidates_p);
+    // DUMP(&candidates_p);
+    llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
+    llama_sample_softmax(nullptr, &candidates_p);
+    // DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }

@@ -146,13 +181,13 @@ int main(void) {
    test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
    test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);

-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0},       50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
+    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
+    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
+    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);

-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
+    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
+    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
+    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);

    printf("OK\n");
Author	SHA1	Message	Date
Georgi Gerganov	ad2727d091	Merge branch 'master' into speculative-tree ggml-ci	2023-10-18 10:50:58 +03:00
Georgi Gerganov	bd9451ca2a	Merge branch 'master' into speculative-tree	2023-10-17 19:31:40 +03:00
Georgi Gerganov	010c52ec59	Merge branch 'master' into speculative-tree	2023-10-17 17:24:11 +03:00
Georgi Gerganov	e6dd81f0bc	speculative : fix the n_drafted fix + p constants	2023-10-17 17:04:31 +03:00
Georgi Gerganov	f07cd35da4	speculative : fix off-by-one for n_drafted	2023-10-17 11:40:26 +03:00
Georgi Gerganov	373d782d42	minor : comments + rename ggml-ci	2023-10-16 18:17:31 +03:00
Georgi Gerganov	1c626e2fe1	speculative : minor refactor ggml-ci	2023-10-16 12:47:37 +03:00
Georgi Gerganov	360a333145	common : add llama_batch_add() and llama_batch_clear() helpers	2023-10-16 12:41:33 +03:00
Georgi Gerganov	005949109d	prompts : add assistant.txt	2023-10-16 12:41:14 +03:00
Georgi Gerganov	5b34bfa2e6	swift : try to fix build ggml-ci	2023-10-16 00:39:57 +03:00
Georgi Gerganov	b8acb6c9b8	swift : fix build ggml-ci	2023-10-16 00:26:16 +03:00
Georgi Gerganov	b5554b9e05	sampling : fix malloc ggml-ci	2023-10-16 00:14:49 +03:00
Georgi Gerganov	0d96efabb5	batched : fix n_seq_id	2023-10-16 00:03:52 +03:00
Georgi Gerganov	7e48e21b1f	examples : fix build after sampling refactoring ggml-ci	2023-10-15 23:36:31 +03:00
Georgi Gerganov	4a7f43f28c	speculative : refactor sampling	2023-10-15 22:53:54 +03:00
Georgi Gerganov	32a67cbd16	speculative : reuse the n_parallel CLI param	2023-10-15 19:35:59 +03:00
Georgi Gerganov	4de5a2d473	speculative : add tree-based sampling support ggml-ci	2023-10-14 17:54:02 +03:00
Georgi Gerganov	5261aee8d8	sampling : one sequence per sampling context ggml-ci	2023-10-12 20:36:44 +03:00