server : add comment about changing slot_state to bool

server : fix slot reuse
server : apply fix from #3722
2026-07-02 18:47:43 +02:00 · 2023-10-22 22:24:39 +03:00 · 2023-10-22 21:57:23 +03:00 · 2023-10-22 21:05:45 +03:00 · 2023-10-22 20:09:25 +03:00 · 2023-10-22 20:03:35 +03:00
46 changed files with 438 additions and 1265 deletions
@@ -1,7 +1,8 @@
 ---
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug"]
+name: Issue and enhancement template
+about: Used to report issues and request enhancements for llama.cpp
+title: "[User] Insert summary of your issue or enhancement."
+labels: ''
 assignees: ''

 ---
@@ -45,7 +46,7 @@ $ g++ --version

 # Failure Information (for bugs)

-Please help provide information about the failure / bug.
+Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.

 # Steps to Reproduce

@@ -1,28 +0,0 @@
---
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
---
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
@@ -82,7 +82,6 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
 #option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
-option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
@@ -306,9 +305,6 @@ if (LLAMA_CUBLAS)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-        endif()
        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
        if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -335,7 +331,6 @@ if (LLAMA_CUBLAS)
            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
-            #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -409,9 +404,6 @@ if (LLAMA_HIPBLAS)
        if (LLAMA_CUDA_FORCE_DMMV)
            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-        endif()
        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -397,9 +397,6 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_FORCE_MMQ
-	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -101,7 +101,7 @@ as the main playground for developing new features for the [ggml](https://github

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
+- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -224,7 +224,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            sparams.temp = std::stof(argv[i]);
-            sparams.temp = std::max(sparams.temp, 0.0f);
        } else if (arg == "--tfs") {
            if (++i >= argc) {
                invalid_param = true;
@@ -633,7 +632,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        process_escapes(params.prompt);
        process_escapes(params.input_prefix);
        process_escapes(params.input_suffix);
-        process_escapes(sparams.cfg_negative_prompt);
        for (auto & antiprompt : params.antiprompt) {
            process_escapes(antiprompt);
        }
@@ -744,7 +742,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
    printf("  --verbose-prompt      print prompt before generation\n");
-    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
+    fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -881,13 +879,13 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    }

    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+        params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
    }

    {
        LOG("warming up the model with an empty run\n");

-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_tokens_rm(lctx, -1, -1);
        llama_reset_timings(lctx);
@@ -942,7 +940,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 }

 std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
+    const llama_token bos_id = llama_token_bos(ctx);

    std::string piece;
    std::string result;
@@ -1187,7 +1185,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);

-    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");

@@ -97,23 +97,22 @@
    #define LOG_TEE_TARGET stderr
 #endif

-// NOTE: currently disabled as it produces too many log files
 // Utility to obtain a "pid"-like unique process id and use it when creating log files.
-//inline std::string log_get_pid()
-//{
-//    static std::string pid;
-//    if (pid.empty())
-//    {
-//        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-//        //  it's not the same as "pid" but is unique enough to solve multiple instances
-//        //  trying to write to the same log.
-//        std::stringstream ss;
-//        ss << std::this_thread::get_id();
-//        pid = ss.str();
-//    }
-//
-//    return pid;
-//}
+inline std::string log_get_pid()
+{
+    static std::string pid;
+    if (pid.empty())
+    {
+        // Built once on first call and cached in the function-local static above.
+        // std::this_thread::get_id() is a thread id, not a real process id; it is used as a
+        //  portable stand-in so that multiple instances do not write to the same log.
+        std::stringstream ss;
+        ss << std::this_thread::get_id();
+        pid = ss.str();
+    }
+
+    return pid;
+}

 // Utility function for generating log file names with unique id based on thread id.
 //  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
@@ -127,8 +126,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
    std::stringstream buf;

    buf << log_file_basename;
-    //buf << ".";
-    //buf << log_get_pid();
+    buf << ".";
+    buf << log_get_pid();
    buf << ".";
    buf << log_file_extension;

@@ -147,7 +147,7 @@ llama_token llama_sampling_sample(

    // apply penalties
    if (!prev.empty()) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+        const float nl_logit = logits[llama_token_nl(ctx_main)];

        llama_sample_repetition_penalties(ctx_main, &cur_p,
                prev.data() + prev.size() - penalty_last_n,
@@ -155,7 +155,7 @@ llama_token llama_sampling_sample(

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
                    cur_p.data[idx].logit = nl_logit;
                    break;
                }
@@ -167,12 +167,8 @@ llama_token llama_sampling_sample(
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

-    if (temp < 0.0) {
-        // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
-        id = cur_p.data[0].id;
-    } else if (temp == 0.0) {
-        // greedy sampling, no probs
+    if (temp <= 0) {
+        // greedy sampling
        id = llama_sample_token_greedy(ctx_main, &cur_p);
    } else {
        if (mirostat == 1) {
@@ -236,8 +236,8 @@ int64_t get_example_targets_batch(
    int64_t used_samples = 0;

    ggml_set_f32(target_probs, 0.0f);
-    llama_token bos = llama_token_bos(llama_get_model(lctx));
-    llama_token eos = llama_token_eos(llama_get_model(lctx));
+    llama_token bos = llama_token_bos(lctx);
+    llama_token eos = llama_token_eos(lctx);
    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
    for (int k=0; k<n_batch; ++k) {
        // printf("%s: batch %d\n", __func__, k);
@@ -924,7 +924,7 @@ size_t tokenize_file(
        for (llama_token token=0; token < n_vocab; ++token) {
            max_token_text_size = std::max(
                max_token_text_size,
-                strlen(llama_token_get_text(llama_get_model(lctx), token)));
+                strlen(llama_token_get_text(lctx, token)));
        }

        // upper bound of context byte length.
@@ -110,7 +110,7 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 print("hello print: ",hparams["architectures"][0])
-if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
+if hparams["architectures"][0] != "BaichuanForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit()
@@ -230,7 +230,7 @@ gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)

-special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
+special_vocab = gguf.SpecialVocab(dir_model)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -118,27 +118,18 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size

-added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

 for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)

-special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -152,7 +152,7 @@ gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)

-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -123,27 +123,18 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size

-added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

 for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)

-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -388,9 +388,7 @@ def handle_metadata(cfg, hp):
        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
        cfg.vocabtype )
    # FIXME: Respect cfg.vocab_dir?
-    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
-        load_merges = cfg.vocabtype == 'bpe',
-        n_vocab = vocab.vocab_size)
+    svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return (params, vocab, svocab)

@@ -128,27 +128,18 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)

-added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

 for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)

-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -139,27 +139,18 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size

-added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

 for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)

-special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -111,26 +111,18 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size

-added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

 for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
@@ -366,19 +366,16 @@ class SentencePieceVocab:
            added_tokens = {}

        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")

-        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids   = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base    = vocab_size
-        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer    = fname_tokenizer
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -1166,13 +1163,10 @@ def main(args_in: list[str] | None = None) -> None:

    vocab: Vocab
    if args.vocab_only:
-        if not args.outfile:
-            raise ValueError("need --outfile if using --vocab-only")
+        assert args.outfile, "need --outfile if using --vocab-only"
        # FIXME: Try to respect vocab_dir somehow?
        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-            load_merges = args.vocabtype == 'bpe',
-            n_vocab = vocab.vocab_size)
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
        print(f"Wrote {outfile}")
@@ -1184,9 +1178,7 @@ def main(args_in: list[str] | None = None) -> None:
        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
        vocab = load_vocab(vocab_dir, args.vocabtype)
    # FIXME: Try to respect vocab_dir somehow?
-    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-        load_merges = args.vocabtype == 'bpe',
-        n_vocab = vocab.vocab_size)
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')

    model   = model_plus.model
    model   = convert_model_names(model, params)
@@ -154,10 +154,6 @@ int main(int argc, char ** argv) {
        }
    }

-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
-    LOG_TEE("\n");
-
    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");

@@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
        return 1 ;
    }

@@ -21,9 +21,6 @@ int main(int argc, char ** argv) {
    // total length of the sequences including the prompt
    int n_len = 32;

-    // number of layers to offload to the GPU
-    int n_gpu_layers = 0;
-
    if (argc >= 2) {
        params.model = argv[1];
    }
@@ -40,10 +37,6 @@ int main(int argc, char ** argv) {
        n_len = std::atoi(argv[4]);
    }

-    if (argc >= 6) {
-        n_gpu_layers = std::atoi(argv[5]);
-    }
-
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }
@@ -56,7 +49,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = llama_model_default_params();

-    model_params.n_gpu_layers = n_gpu_layers;
+    // model_params.n_gpu_layers = 99; // offload all layers to the GPU

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -187,7 +180,7 @@ int main(int argc, char ** argv) {
            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
 }

 // Function matching type llama_beam_search_callback_fn_t.
@@ -246,14 +246,14 @@ int main(int argc, char ** argv) {
    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
    if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
    }
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(model));
+    embd_inp.push_back(llama_token_middle(ctx));

    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -261,7 +261,7 @@ int main(int argc, char ** argv) {

    // Should not run without any tokens
    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(model));
+        embd_inp.push_back(llama_token_bos(ctx));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

@@ -577,10 +577,10 @@ int main(int argc, char ** argv) {
        if ((int) embd_inp.size() <= n_consumed) {

            // deal with eot token in infill mode
-            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){
                if(is_interacting && !params.interactive_first) {
                    // print an eot token
-                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
                }
                fflush(stdout);
                printf("\n");
@@ -627,14 +627,14 @@ int main(int argc, char ** argv) {
                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
                if (add_bos) {
-                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
                }
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(model));
+                embd_inp.push_back(llama_token_middle(ctx));
                embd.clear();
                embd_guidance.clear();
                n_remain = params.n_predict;
@@ -644,7 +644,7 @@ int main(int argc, char ** argv) {
                is_interacting = false;
            }
            // deal with end of text token in interactive mode
-            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -661,7 +661,7 @@ int main(int argc, char ** argv) {

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(model));
+                    embd_inp.push_back(llama_token_bos(ctx));
                }

                std::string buffer;
@@ -724,7 +724,7 @@ int main(int argc, char ** argv) {
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
            break;
        }

@@ -736,7 +736,7 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
        fflush(stdout);
    }

@@ -933,7 +933,7 @@ struct sql_printer : public printer {
 };

 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
    int n_processed = 0;

    llama_set_n_threads(ctx, n_threads, n_threads);
@@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 }

 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos(llama_get_model(ctx));
+    llama_token token = llama_token_bos(ctx);

    llama_set_n_threads(ctx, n_threads, n_threads);

@@ -137,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
 inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+    if (id == llama_token_eos(ctx_llama)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
@@ -16,8 +16,6 @@ add_library(common OBJECT
    ${_common_path}/console.cpp
    ${_common_path}/grammar-parser.h
    ${_common_path}/grammar-parser.cpp
-    ${_common_path}/sampling.h
-    ${_common_path}/sampling.cpp
    )

 # WARNING: because build-info.h is auto-generated, it will only
@@ -248,7 +248,7 @@ int main(int argc, char ** argv) {

    // Should not run without any tokens
    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(model));
+        embd_inp.push_back(llama_token_bos(ctx));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

@@ -693,7 +693,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of text token in interactive mode
-            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -720,7 +720,7 @@ int main(int argc, char ** argv) {

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(model));
+                    embd_inp.push_back(llama_token_bos(ctx));
                }

                std::string buffer;
@@ -761,9 +761,6 @@ int main(int argc, char ** argv) {
                        n_consumed = embd_inp.size();
                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                    }
-                    if (params.escape) {
-                        process_escapes(buffer);
-                    }

                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
@@ -804,7 +801,7 @@ int main(int argc, char ** argv) {
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
@@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (id == llama_token_eos(model) ||
+                        (id == llama_token_eos(ctx) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
@@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[batch_start] = llama_token_bos(ctx);
            }

            const auto batch_logits = llama_get_logits(ctx);
@@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[batch_start] = llama_token_bos(ctx);
            }

            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
@@ -454,7 +454,7 @@ struct llama_client_slot
    }

    void release() {
-        if (state == IDLE || state == PROCESSING)
+        if (state == PROCESSING)
        {
            t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
            command = RELEASE;
@@ -726,7 +726,7 @@ struct llama_server_context

        if (json_value(data, "ignore_eos", false))
        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+            slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY;
        }

        const auto &logit_bias = data.find("logit_bias");
@@ -754,7 +754,6 @@ struct llama_server_context
        }

        slot->params.antiprompt.clear();
-
        const auto &stop = data.find("stop");
        if (stop != data.end() && stop->is_array())
        {
@@ -868,7 +867,7 @@ struct llama_server_context

        kv_cache_clear();

-        for (int i = 0; i < (int) system_tokens.size(); ++i)
+        for (int32_t i = 0; i < batch.n_tokens; ++i)
        {
            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
        }
@@ -895,8 +894,16 @@ struct llama_server_context
        {
            slot.release();
        }
+        wait_all_are_idle();
+        all_slots_are_idle = true;

+        // wait until system prompt load
        system_need_update = true;
+        while (system_need_update)
+        {
+            std::this_thread::sleep_for(std::chrono::milliseconds(5));
+        }
+        // system prompt loaded, continue
    }

    void process_system_prompt_data(const json &sys_props) {
@@ -908,6 +915,26 @@ struct llama_server_context
        {
            notify_system_prompt_changed();
        }
+        else
+        {
+            system_need_update = true;
+        }
+    }
+
+    void wait_all_are_idle() {
+        bool wait = true;
+        while (wait)
+        {
+            wait = false;
+            for (auto &slot : slots)
+            {
+                if (!slot.available())
+                {
+                    wait = true;
+                    break;
+                }
+            }
+        }
    }

    static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -938,6 +965,7 @@ struct llama_server_context
                    slot.has_next_token = false;
                }
                stop_pos = pos;
+
            }
        }

@@ -1028,7 +1056,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }

-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1102,7 +1130,7 @@ struct llama_server_context

    json get_formated_generation(llama_client_slot &slot)
    {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx));
        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
        return json {
@@ -1416,7 +1444,7 @@ struct llama_server_context
        process_tasks();

        // update the system prompt wait until all slots are idle state
-        if (system_need_update && all_slots_are_idle)
+        if (system_need_update)
        {
            LOG_TEE("updating system prompt\n");
            update_system_prompt();
@@ -1470,7 +1498,7 @@ struct llama_server_context
        for (auto & slot : slots)
        {
            // release the slot
-            if (slot.command == RELEASE)
+            if (slot.state == PROCESSING && slot.command == RELEASE)
            {
                slot.state = IDLE;
                slot.command = NONE;
@@ -1481,7 +1509,7 @@ struct llama_server_context
                continue;
            }

-            if (slot.state == IDLE)
+            if (slot.state == IDLE || slot.command == RELEASE)
            {
                continue;
            }
@@ -1502,17 +1530,6 @@ struct llama_server_context
        {
            for (auto & slot : slots)
            {
-                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
-
-                // empty prompt passed -> release the slot and send empty response
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
-                {
-                    slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
-                    continue;
-                }
-
                // need process the prompt
                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
                {
@@ -1538,11 +1555,11 @@ struct llama_server_context
                            suffix_tokens.erase(suffix_tokens.begin());
                        }

-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
+                        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
                        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_token_middle(model));
+                        prefix_tokens.push_back(llama_token_middle(ctx));
                        prompt_tokens = prefix_tokens;
                    }
                    else
@@ -1732,8 +1749,8 @@ struct llama_server_context
                if (!process_token(result, slot))
                {
                    slot.release();
-                    slot.print_timings();
                    send_final_response(slot);
+                    slot.print_timings();
                }

                slot.i_batch = -1;
@@ -1749,16 +1766,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("usage: %s [options]\n", argv0);
    printf("\n");
    printf("options:\n");
-    printf("  -h, --help                show this help message and exit\n");
-    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N,  --threads N        number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  -c N,  --ctx-size N       size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
-    printf("  --rope-freq-scale N       RoPE frequency scaling factor (default: loaded from model)\n");
-    printf("  -b N,  --batch-size N     batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  -h, --help            show this help message and exit\n");
+    printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
+    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
    {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@@ -1908,15 +1924,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_threads = std::stoi(argv[i]);
        }
-        else if (arg == "--threads-batch" || arg == "-tb")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_batch = std::stoi(argv[i]);
-        }
        else if (arg == "-b" || arg == "--batch-size")
        {
            if (++i >= argc)
@@ -2278,7 +2285,7 @@ int main(int argc, char **argv)
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
-                    if (!result.error && result.stop) {
+                    if(!result.error && result.stop) {
                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
                    }
                    else
@@ -2305,7 +2312,7 @@ int main(int argc, char **argv)
                                {
                                    return false;
                                }
-                                if (result.stop) {
+                                if(result.stop) {
                                    break;
                                }
                            } else {
@@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+    batch.n_tokens = tokens_list.size();
+
+    for (int32_t i = 0; i < batch.n_tokens; i++) {
+        batch.token[i]  = tokens_list[i];
+        batch.pos[i]    = i;
+        batch.seq_id[i] = 0;
+        batch.logits[i] = false;
    }

    // llama_decode will output logits only for the last token of the prompt
@@ -133,7 +138,7 @@ int main(int argc, char ** argv) {
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
                LOG_TEE("\n");

                break;
@@ -143,10 +148,15 @@ int main(int argc, char ** argv) {
            fflush(stdout);

            // prepare the next batch
-            llama_batch_clear(batch);
+            batch.n_tokens = 0;

            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+            batch.token [batch.n_tokens] = new_token_id;
+            batch.pos   [batch.n_tokens] = n_cur;
+            batch.seq_id[batch.n_tokens] = 0;
+            batch.logits[batch.n_tokens] = true;
+
+            batch.n_tokens += 1;

            n_decode += 1;
        }
@@ -8,9 +8,6 @@
 #include <string>
 #include <vector>

-#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
-#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
-
 struct seq_draft {
    bool active   = false;
    bool drafting = false;
@@ -67,33 +64,6 @@ int main(int argc, char ** argv) {
    params.n_gpu_layers = params.n_gpu_layers_draft;
    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

-    {
-        const int n_vocab_tgt = llama_n_vocab(model_tgt);
-        const int n_vocab_dft = llama_n_vocab(model_dft);
-        const int vocab_diff  = n_vocab_tgt > n_vocab_dft
-            ? n_vocab_tgt - n_vocab_dft
-            : n_vocab_dft - n_vocab_tgt;
-
-        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
-            fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
-                    n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
-            return 1;
-        }
-
-        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
-            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
-            const char * token_text_dft = llama_token_get_text(model_dft, i);
-            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
-                fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
-                        llama_token_to_piece(ctx_tgt, i).c_str(),
-                        llama_token_to_piece(ctx_dft, i).c_str());
-                return 1;
-            }
-        }
-    }
-
    // tokenize the prompt
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@@ -148,7 +118,7 @@ int main(int argc, char ** argv) {
    std::vector<seq_draft> drafts(n_seq_dft);

    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    params.sparams.temp = -1.0f;    // force greedy sampling with probs for the draft model
+    params.sparams.temp = std::max(0.01f, params.sparams.temp);

    for (int s = 0; s < n_seq_dft; ++s) {
        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@@ -193,7 +163,7 @@ int main(int argc, char ** argv) {
            printf("%s", token_str.c_str());
            fflush(stdout);

-            if (id == llama_token_eos(model_tgt)) {
+            if (id == llama_token_eos(ctx_tgt)) {
                has_eos = true;
            }

@@ -257,7 +227,6 @@ int main(int argc, char ** argv) {
            llama_batch_add  (batch_dft, id, n_past_dft, { 0 }, true);

            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
            llama_decode         (ctx_dft, batch_dft);

            ++n_past_dft;
@@ -401,7 +370,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
            }

-            // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+            //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
            llama_decode(ctx_tgt, batch_tgt);
            ++n_past_tgt;
        }
@@ -29,8 +29,6 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
@@ -87,24 +85,6 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)

-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-//       probably other such cases, and not sure what happens on AMD hardware
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

@@ -488,6 +468,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -3571,15 +3552,9 @@ static __device__ __forceinline__ void mul_mat_q(
 #define  MMQ_X_Q4_0_RDNA1  64
 #define  MMQ_Y_Q4_0_RDNA1  64
 #define NWARPS_Q4_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_0_AMPERE 4
-#define  MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
 #define  MMQ_X_Q4_0_AMPERE 64
 #define  MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
-#endif
 #define  MMQ_X_Q4_0_PASCAL 64
 #define  MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
@@ -3638,15 +3613,9 @@ template <bool need_check> static __global__ void
 #define  MMQ_X_Q4_1_RDNA1  64
 #define  MMQ_Y_Q4_1_RDNA1  64
 #define NWARPS_Q4_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_1_AMPERE 4
-#define  MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
 #define  MMQ_X_Q4_1_AMPERE 64
 #define  MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
-#endif
 #define  MMQ_X_Q4_1_PASCAL 64
 #define  MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3707,15 +3676,9 @@ template <bool need_check> static __global__ void
 #define  MMQ_X_Q5_0_RDNA1  64
 #define  MMQ_Y_Q5_0_RDNA1  64
 #define NWARPS_Q5_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_0_AMPERE 4
-#define  MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
 #define  MMQ_X_Q5_0_AMPERE 128
 #define  MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
-#endif
 #define  MMQ_X_Q5_0_PASCAL 64
 #define  MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3774,15 +3737,9 @@ template <bool need_check> static __global__ void
 #define  MMQ_X_Q5_1_RDNA1  64
 #define  MMQ_Y_Q5_1_RDNA1  64
 #define NWARPS_Q5_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_1_AMPERE 4
-#define  MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
 #define  MMQ_X_Q5_1_AMPERE 128
 #define  MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
-#endif
 #define  MMQ_X_Q5_1_PASCAL 64
 #define  MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3841,15 +3798,9 @@ mul_mat_q5_1(
 #define  MMQ_X_Q8_0_RDNA1  64
 #define  MMQ_Y_Q8_0_RDNA1  64
 #define NWARPS_Q8_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q8_0_AMPERE 4
-#define  MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
 #define  MMQ_X_Q8_0_AMPERE 128
 #define  MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
-#endif
 #define  MMQ_X_Q8_0_PASCAL 64
 #define  MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3908,15 +3859,9 @@ template <bool need_check> static __global__ void
 #define  MMQ_X_Q2_K_RDNA1  128
 #define  MMQ_Y_Q2_K_RDNA1  32
 #define NWARPS_Q2_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q2_K_AMPERE 4
-#define  MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
 #define  MMQ_X_Q2_K_AMPERE 64
 #define  MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
-#endif
 #define  MMQ_X_Q2_K_PASCAL 64
 #define  MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3975,15 +3920,9 @@ mul_mat_q2_K(
 #define  MMQ_X_Q3_K_RDNA1  32
 #define  MMQ_Y_Q3_K_RDNA1  128
 #define NWARPS_Q3_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q3_K_AMPERE 4
-#define  MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
 #define  MMQ_X_Q3_K_AMPERE 128
 #define  MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
-#endif
 #define  MMQ_X_Q3_K_PASCAL 64
 #define  MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -4044,15 +3983,9 @@ template <bool need_check> static __global__ void
 #define  MMQ_X_Q4_K_RDNA1  32
 #define  MMQ_Y_Q4_K_RDNA1  64
 #define NWARPS_Q4_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_K_AMPERE 4
-#define  MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
 #define  MMQ_X_Q4_K_AMPERE 64
 #define  MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
-#endif
 #define  MMQ_X_Q4_K_PASCAL 64
 #define  MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4113,15 +4046,9 @@ template <bool need_check> static __global__ void
 #define  MMQ_X_Q5_K_RDNA1  32
 #define  MMQ_Y_Q5_K_RDNA1  64
 #define NWARPS_Q5_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_K_AMPERE 4
-#define  MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
 #define  MMQ_X_Q5_K_AMPERE 64
 #define  MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
-#endif
 #define  MMQ_X_Q5_K_PASCAL 64
 #define  MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4180,15 +4107,9 @@ mul_mat_q5_K(
 #define  MMQ_X_Q6_K_RDNA1  32
 #define  MMQ_Y_Q6_K_RDNA1  64
 #define NWARPS_Q6_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q6_K_AMPERE 4
-#define  MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
 #define  MMQ_X_Q6_K_AMPERE 64
 #define  MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
-#endif
 #define  MMQ_X_Q6_K_PASCAL 64
 #define  MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -4405,13 +4326,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous

    const half * x = (const half *) vx;

-    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / channel_x_divisor;

-    const int nrows_y   = ncols_x;
+    const int nrows_y = ncols_x;
    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
+    const int row_dst = row_x;

    const int idst = channel*nrows_dst + row_dst;

@@ -4424,13 +4345,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
            break;
        }

+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
+        const float xi = __half2float(x[ix]);
+
        const int row_y = col_x;

-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
        const int iy = channel*nrows_y + row_y;

-        const float xi = __half2float(x[ix]);
-
        tmp += xi * y[iy];
    }

@@ -5740,21 +5661,11 @@ void ggml_init_cublas() {
        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

            g_tensor_split[id] = total_vram;
            total_vram += prop.totalGlobalMem;
@@ -5764,15 +5675,15 @@ void ggml_init_cublas() {
            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        }
-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
            g_tensor_split[id] /= total_vram;
        }

-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
            CUDA_CHECK(ggml_cuda_set_device(id));

            // create cuda streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
            }

@@ -6341,15 +6252,16 @@ inline void ggml_cuda_op_mul_mat_cublas(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, const cudaStream_t & stream) {

-    GGML_ASSERT(src0_dd_i  != nullptr);
+    GGML_ASSERT(src0_dd_i != nullptr);
    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i   != nullptr);
+    GGML_ASSERT(dst_dd_i != nullptr);
+

    const int64_t ne00 = src0->ne[0];
+
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
-
    const int64_t row_diff = row_high - row_low;

    int id;
@@ -6434,7 +6346,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
                    &alpha, src0_ddf_i, ne00,
-                            src1_ddf_i, ne10,
+                            src1_ddf_i,  ne10,
                    &beta,  dst_dd_i,   ldc));

        if (src0_as != 0) {
@@ -7101,8 +7013,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
 }

 static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -7112,11 +7023,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

+    const int64_t ne12 = src1->ne[2];
+
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];

-    const int64_t ne12 = src1->ne[2];
-
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

@@ -7135,200 +7046,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-    half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-#if 0
-    // use cublasGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                CUBLAS_CHECK(
-                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
-                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
-                            &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
-                            CUBLAS_COMPUTE_16F,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use cublasGemmStridedBatchedEx
-        CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                ne01, ne11, ne10,
-                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
-                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
-                ne12*ne13,
-                CUBLAS_COMPUTE_16F,
-                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    } else {
-        // use cublasGemmBatchedEx
-        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
-        const int ne23 = ne12*ne13;
-
-        // TODO: avoid this alloc
-        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3];
-                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
-                ptrs[2*ne23 + i12 + i13*ne12] = (char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
-            }
-        }
-
-        // allocate device memory for pointers
-        void ** ptrs_as = nullptr;
-        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
-
-        // TODO: this does not work for some reason -- not sure why?
-        //size_t ptrs_s = 0;
-        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
-
-        // copy pointers to device
-        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
-
-        free(ptrs);
-
-        CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                ne01, ne11, ne10,
-                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-                &beta_f16,  (      void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
-                ne23,
-                CUBLAS_COMPUTE_16F,
-                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        // free device memory for pointers
-        CUDA_CHECK(cudaFree(ptrs_as));
-        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
-    }
-#endif
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
+    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

    int64_t min_compute_capability = INT_MAX;
    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
            min_compute_capability = g_compute_capabilities[id];
        }
    }

-#ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
-#else
-    const bool use_tensor_cores = false;
-#endif
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
+    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
-        // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+
 #ifdef GGML_CUDA_FORCE_DMMV
            const bool use_mul_mat_vec_q = false;
 #else
@@ -7341,15 +7079,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
            }
        } else {
-            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-            // when tensor cores are available, use them for large batch size
-            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-            }
-
-            if (use_mul_mat_q) {
+            if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
            } else {
                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
@@ -7703,6 +7433,10 @@ void ggml_cuda_set_main_device(const int main_device) {
    }
 }

+void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
+    g_mul_mat_q = mul_mat_q;
+}
+
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
    // it still won't always work as expected, but it's better than nothing
@@ -62,7 +62,6 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul);
    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
    GGML_METAL_DECL_KERNEL(scale);
-    GGML_METAL_DECL_KERNEL(scale_4);
    GGML_METAL_DECL_KERNEL(silu);
    GGML_METAL_DECL_KERNEL(relu);
    GGML_METAL_DECL_KERNEL(gelu);
@@ -250,7 +249,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul);
        GGML_METAL_ADD_KERNEL(mul_row);
        GGML_METAL_ADD_KERNEL(scale);
-        GGML_METAL_ADD_KERNEL(scale_4);
        GGML_METAL_ADD_KERNEL(silu);
        GGML_METAL_ADD_KERNEL(relu);
        GGML_METAL_ADD_KERNEL(gelu);
@@ -349,7 +347,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul);
    GGML_METAL_DEL_KERNEL(mul_row);
    GGML_METAL_DEL_KERNEL(scale);
-    GGML_METAL_DEL_KERNEL(scale_4);
    GGML_METAL_DEL_KERNEL(silu);
    GGML_METAL_DEL_KERNEL(relu);
    GGML_METAL_DEL_KERNEL(gelu);
@@ -926,20 +923,15 @@ void ggml_metal_graph_compute(

                            const float scale = *(const float *) src1->data;

-                            int64_t n = ggml_nelements(dst);
-
-                            if (n % 4 == 0) {
-                                n /= 4;
-                                [encoder setComputePipelineState:ctx->pipeline_scale_4];
-                            } else {
-                                [encoder setComputePipelineState:ctx->pipeline_scale];
-                            }
-
+                            [encoder setComputePipelineState:ctx->pipeline_scale];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            const int64_t n = ggml_nelements(dst);
+                            GGML_ASSERT(n % 4 == 0);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_UNARY:
                        switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -125,17 +125,9 @@ kernel void kernel_mul_row(
 }

 kernel void kernel_scale(
-        device const float * src0,
-        device       float * dst,
-        constant     float & scale,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
-}
-
-kernel void kernel_scale_4(
        device const float4 * src0,
        device       float4 * dst,
-        constant     float  & scale,
+        constant     float & scale,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * scale;
 }
@@ -571,6 +571,7 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif

+
 //
 // cache line
 //
@@ -1827,6 +1828,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
    return type_traits[type];
 }

+
 //
 // simd mappings
 //
@@ -4055,17 +4057,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "ALIBI",
    "CLAMP",
    "CONV_1D",
-    "CONV_1D_STAGE_0",
-    "CONV_1D_STAGE_1",
    "CONV_TRANSPOSE_1D",
    "CONV_2D",
-    "CONV_2D_STAGE_0",
-    "CONV_2D_STAGE_1",
    "CONV_TRANSPOSE_2D",
    "POOL_1D",
    "POOL_2D",
    "UPSCALE",

+    "CONV_1D_STAGE_0",
+    "CONV_1D_STAGE_1",
+
    "FLASH_ATTN",
    "FLASH_FF",
    "FLASH_ATTN_BACK",
@@ -4091,7 +4092,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -4142,17 +4143,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "alibi(x)",
    "clamp(x)",
    "conv_1d(x)",
-    "conv_1d_stage_0(x)",
-    "conv_1d_stage_1(x)",
    "conv_transpose_1d(x)",
    "conv_2d(x)",
-    "conv_2d_stage_0(x)",
-    "conv_2d_stage_1(x)",
    "conv_transpose_2d(x)",
    "pool_1d(x)",
    "pool_2d(x)",
    "upscale(x)",

+    "conv_1d_stage_0(x)",
+    "conv_1d_stage_1(x)",
+
    "flash_attn(x)",
    "flash_ff(x)",
    "flash_attn_back(x)",
@@ -4178,7 +4178,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4209,10 +4209,8 @@ static void ggml_setup_op_has_task_pass(void) {
        p[GGML_OP_CONV_1D                ] = true;
        p[GGML_OP_CONV_1D_STAGE_0        ] = true;
        p[GGML_OP_CONV_1D_STAGE_1        ] = true;
-        p[GGML_OP_CONV_TRANSPOSE_1D      ] = true;
        p[GGML_OP_CONV_2D                ] = true;
-        p[GGML_OP_CONV_2D_STAGE_0        ] = true;
-        p[GGML_OP_CONV_2D_STAGE_1        ] = true;
+        p[GGML_OP_CONV_TRANSPOSE_1D      ] = true;
        p[GGML_OP_CONV_TRANSPOSE_2D      ] = true;
        p[GGML_OP_FLASH_ATTN_BACK        ] = true;
        p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
@@ -5956,6 +5954,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
    return ggml_sqrt_impl(ctx, a, true);
 }

+
 // ggml_log

 static struct ggml_tensor * ggml_log_impl(
@@ -6009,6 +6008,7 @@ struct ggml_tensor * ggml_sum(
    return result;
 }

+
 // ggml_sum_rows

 struct ggml_tensor * ggml_sum_rows(
@@ -6640,6 +6640,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
 }

+
 // ggml_cpy

 static struct ggml_tensor * ggml_cpy_impl(
@@ -6719,6 +6720,7 @@ struct ggml_tensor * ggml_cont_inplace(
    return ggml_cont_impl(ctx, a, true);
 }

+
 // make contiguous, with new shape
 GGML_API struct ggml_tensor * ggml_cont_1d(
        struct ggml_context * ctx,
@@ -7171,6 +7173,7 @@ struct ggml_tensor * ggml_diag(
    return result;
 }

+
 // ggml_diag_mask_inf

 static struct ggml_tensor * ggml_diag_mask_inf_impl(
@@ -7282,6 +7285,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
    return ggml_soft_max_impl(ctx, a, true);
 }

+
 // ggml_soft_max_back

 static struct ggml_tensor * ggml_soft_max_back_impl(
@@ -7698,11 +7702,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(

 // ggml_conv_2d

-// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OH, OW, IC*KH*KW]
-static struct ggml_tensor * ggml_conv_2d_stage_0(
+struct ggml_tensor * ggml_conv_2d(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b,
@@ -7721,21 +7721,17 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
        is_node = true;
    }

-    const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
-    const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-
    const int64_t ne[4] = {
-        a->ne[2] * a->ne[1] * a->ne[0],
-        OW,
-        OH,
-        b->ne[3],
+        ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+        ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
+        a->ne[3], b->ne[3],
    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
    ggml_set_op_params(result, params, sizeof(params));

-    result->op = GGML_OP_CONV_2D_STAGE_0;
+    result->op = GGML_OP_CONV_2D;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = b;
@@ -7744,61 +7740,8 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(

 }

-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// a: [OC, IC, KH, KW]
-// b: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static struct ggml_tensor * ggml_conv_2d_stage_1(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b) {
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        b->ne[1],
-        b->ne[2],
-        a->ne[3],
-        b->ne[3],
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op = GGML_OP_CONV_2D_STAGE_1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-
-}
-
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OC, OH, OW]
-struct ggml_tensor * ggml_conv_2d(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1) {
-
-    struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
-    result = ggml_conv_2d_stage_1(ctx, a, result);
-
-    return result;
-
-}
-
 // ggml_conv_2d_sk_p0
+
 struct ggml_tensor * ggml_conv_2d_sk_p0(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@@ -8237,6 +8180,7 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
    return result;
 }

+
 struct ggml_tensor * ggml_add_rel_pos(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@@ -8681,6 +8625,8 @@ struct ggml_tensor * ggml_map_custom3_inplace(
    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
 }

+
+
 // ggml_cross_entropy_loss

 struct ggml_tensor * ggml_cross_entropy_loss(
@@ -9882,6 +9828,7 @@ static void ggml_compute_forward_add1(
    }
 }

+
 // ggml_compute_forward_acc

 static void ggml_compute_forward_acc_f32(
@@ -10021,6 +9968,7 @@ static void ggml_compute_forward_sub_f32(
            const int i2 = (ir - i3*ne2*ne1)/ne1;
            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

+
 #ifdef GGML_USE_ACCELERATE
            vDSP_vsub(
                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
@@ -10201,6 +10149,7 @@ static void ggml_compute_forward_div_f32(
            const int i2 = (ir - i3*ne2*ne1)/ne1;
            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

+
 #ifdef GGML_USE_ACCELERATE
            UNUSED(ggml_vec_div_f32);

@@ -10338,6 +10287,7 @@ static void ggml_compute_forward_sqrt(
    }
 }

+
 // ggml_compute_forward_log

 static void ggml_compute_forward_log_f32(
@@ -12170,6 +12120,7 @@ static void ggml_compute_forward_out_prod_f32(
        }
    }

+
    //int64_t t1 = ggml_perf_time_us();
    //static int64_t acc = 0;
    //acc += t1 - t0;
@@ -12365,6 +12316,7 @@ static void ggml_compute_forward_scale_f32(

    const size_t nb1 = dst->nb[1];

+
    for (int i1 = ir0; i1 < ir1; i1++) {
        if (dst->data != src0->data) {
            // src0 is same shape as dst => same indices
@@ -12762,6 +12714,7 @@ static void ggml_compute_forward_get_rows_back_f32(
    }
 }

+
 static void ggml_compute_forward_get_rows_back(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@@ -14044,7 +13997,6 @@ static void ggml_compute_forward_conv_1d_f32(
    }
 }

-// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
                             ggml_fp16_t * A,
                             ggml_fp16_t * B,
@@ -14346,9 +14298,6 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
            }
        }

-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-
        return;
    }

@@ -14421,7 +14370,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
                    float * dst_data = wdata + i01*ne00*ne02;
                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
+                        dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00];
                    }
                }
            }
@@ -14440,9 +14389,6 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
            }
        }

-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-
        return;
    }

@@ -14504,144 +14450,6 @@ static void ggml_compute_forward_conv_transpose_1d(

 // ggml_compute_forward_conv_2d

-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_conv_2d_stage_0_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int64_t N = ne13;
-    const int64_t IC = ne12;
-    const int64_t IH = ne11;
-    const int64_t IW = ne10;
-
-    // const int64_t OC = ne03;
-    // const int64_t IC = ne02;
-    const int64_t KH = ne01;
-    const int64_t KW = ne00;
-
-    const int64_t OH = ne2;
-    const int64_t OW = ne1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) {
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic+=nth) {
-
-                        // micro kernel
-                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
-
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-
-                                if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// src0: [OC, IC, KH, KW]
-// src1: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static void ggml_compute_forward_conv_2d_stage_1_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    const int N = ne13;
-    const int OH = ne12;
-    const int OW = ne11;
-
-    const int OC = ne03;
-    const int IC = ne02;
-    const int KH = ne01;
-    const int KW = ne00;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    int64_t m = OC;
-    int64_t n = OH * OW;
-    int64_t k = IC * KH * KW;
-
-    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-    for (int i = 0; i < N; i++) {
-        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
-        ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m, n]
-
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-
 static void ggml_compute_forward_conv_2d_f16_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@@ -14654,40 +14462,16 @@ static void ggml_compute_forward_conv_2d_f16_f32(
    int64_t t0 = ggml_perf_time_us();
    UNUSED(t0);

-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    // src1: image [N, IC, IH, IW]
-    // src0: kernel [OC, IC, KH, KW]
-    // dst:  result [N, OC, OH, OW]
-    // ne12: IC
-    // ne0: OW
-    // ne1: OH
-    // nk0: KW
-    // nk1: KH
-    // ne13: N
-
-    const int N = ne13;
-    const int IC = ne12;
-    const int IH = ne11;
-    const int IW = ne10;
-
-    const int OC = ne03;
-    // const int IC = ne02;
-    const int KH = ne01;
-    const int KW = ne00;
-
-    const int OH = ne1;
-    const int OW = ne0;
+    GGML_TENSOR_BINARY_OP_LOCALS;

    const int ith = params->ith;
    const int nth = params->nth;

-    // const int nk0 = ne00;
-    // const int nk1 = ne01;
+    const int nk0 = ne00;
+    const int nk1 = ne01;

    // size of the convolution row - the kernel size unrolled across all channels
-    // const int ew0 = nk0*nk1*ne02;
-    // ew0: IC*KH*KW
+    const int ew0 = nk0*nk1*ne02;

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@@ -14703,27 +14487,24 @@ static void ggml_compute_forward_conv_2d_f16_f32(
        memset(params->wdata, 0, params->wsize);

        // prepare source data (src1)
-        // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
-
        {
            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

-            for (int in = 0; in < N; in++) {
-                for (int iic = 0; iic < IC; iic++) {
-                    for (int ioh = 0; ioh < OH; ioh++) {
-                        for (int iow = 0; iow < OW; iow++) {
+            for (int i13 = 0; i13 < ne13; i13++) {
+                for (int i12 = 0; i12 < ne12; i12++) {
+                    const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
+                    ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);

-                            // micro kernel
-                            ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                            const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
+                    for (int i1 = 0; i1 < ne1; i1++) {
+                        for (int i0 = 0; i0 < ne0; i0++) {
+                            for (int ik1 = 0; ik1 < nk1; ik1++) {
+                                for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                    const int idx0 = i0*s0 + ik0*d0 - p0;
+                                    const int idx1 = i1*s1 + ik1*d1 - p1;

-                            for (int ikh = 0; ikh < KH; ikh++) {
-                                for (int ikw = 0; ikw < KW; ikw++) {
-                                    const int iiw = iow*s0 + ikw*d0 - p0;
-                                    const int iih = ioh*s1 + ikh*d1 - p1;
-
-                                    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
-                                        dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
+                                    if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
+                                        dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                            GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                                    }
                                }
                            }
@@ -14740,22 +14521,30 @@ static void ggml_compute_forward_conv_2d_f16_f32(
        return;
    }

+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    // wdata: [N*OH*OW, IC*KH*KW]
-    // dst: result [N, OC, OH, OW]
-    // src0: kernel [OC, IC, KH, KW]

-    int64_t m = OC;
-    int64_t n = OH * OW;
-    int64_t k = IC * KH * KW;
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ip0; i2 < ip1; i2++) {
+            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);

-    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-    for (int i = 0; i < N; i++) {
-        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
-        ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m * k]
-
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
+            for (int i1 = 0; i1 < ne1; ++i1) {
+                for (int i0 = 0; i0 < ne0; ++i0) {
+                    ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+                            (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+                            (ggml_fp16_t *)                wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
+                }
+            }
+        }
    }
 }

@@ -14781,48 +14570,6 @@ static void ggml_compute_forward_conv_2d(
    }
 }

-static void ggml_compute_forward_conv_2d_stage_0(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_2d_stage_1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_conv_transpose_2d

 static void ggml_compute_forward_conv_transpose_2d(
@@ -14881,8 +14628,6 @@ static void ggml_compute_forward_conv_transpose_2d(
            }
        }

-        memset(dst->data, 0, ggml_nbytes(dst));
-
        return;
    }

@@ -16381,6 +16126,7 @@ static void ggml_compute_forward_add_rel_pos_f32(
    const int ip0 = dp*ith;
    const int ip1 = MIN(ip0 + dp, np);

+
    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -16447,6 +16193,7 @@ static void ggml_compute_forward_map_unary_f32(
    }
 }

+
 static void ggml_compute_forward_map_unary(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@@ -16494,6 +16241,7 @@ static void ggml_compute_forward_map_binary_f32(
    }
 }

+
 static void ggml_compute_forward_map_binary(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@@ -16545,6 +16293,7 @@ static void ggml_compute_forward_map_custom2_f32(
    fun(dst, a, b);
 }

+
 // ggml_compute_forward_map_custom3

 static void ggml_compute_forward_map_custom3_f32(
@@ -16819,6 +16568,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        ggml_vec_sub_f32(nc, ds0, ds0, s1);
        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);

+
 #ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
            assert(!isnan(ds0[i]));
@@ -16846,15 +16596,12 @@ static void ggml_compute_forward_cross_entropy_loss_back(
    }
 }

+
 /////////////////////////////////

 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    GGML_ASSERT(params);

-    if (tensor->op == GGML_OP_NONE) {
-        return;
-    }
-
 #ifdef GGML_USE_CUBLAS
    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
    if (skip_cpu) {
@@ -17057,14 +16804,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
-        case GGML_OP_CONV_2D_STAGE_0:
-            {
-                ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_2D_STAGE_1:
-            {
-                ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
@@ -17994,19 +17733,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
        case GGML_OP_CONV_2D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
-        case GGML_OP_CONV_2D_STAGE_0:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_2D_STAGE_1:
+        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
@@ -18935,7 +18666,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    const int64_t ne0 = node->ne[0];
                    const int64_t ne1 = node->ne[1];
                    const int64_t ne2 = node->ne[2];
-                    const int64_t ne3 = node->ne[3];
                    const int64_t nk = ne00*ne01;
                    const int64_t ew0 = nk * ne02;

@@ -18946,8 +18676,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

                    if (node->src[0]->type == GGML_TYPE_F16 &&
                        node->src[1]->type == GGML_TYPE_F32) {
-                        // im2col: [N*OH*OW, IC*KH*KW]
-                        cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
+                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
                               node->src[1]->type == GGML_TYPE_F32) {
                        cur = sizeof(float)*      (ne10*ne11*ne12);
@@ -18957,14 +18686,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

                    work_size = MAX(work_size, cur);
                } break;
-            case GGML_OP_CONV_2D_STAGE_0:
-                {
-                    n_tasks = n_threads;
-                } break;
-            case GGML_OP_CONV_2D_STAGE_1:
-                {
-                    n_tasks = n_threads;
-                } break;
            case GGML_OP_CONV_TRANSPOSE_2D:
                {
                    n_tasks = n_threads;
@@ -20153,6 +19874,7 @@ static enum ggml_opt_result ggml_opt_adam(

        opt->loss_after = fx;

+
        // check convergence
        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
            GGML_PRINT_DEBUG("converged\n");
@@ -401,16 +401,15 @@ extern "C" {
        GGML_OP_ALIBI,
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
-        GGML_OP_CONV_1D_STAGE_0,  // internal
-        GGML_OP_CONV_1D_STAGE_1,  // internal
-        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_CONV_2D,
-        GGML_OP_CONV_2D_STAGE_0, // internal
-        GGML_OP_CONV_2D_STAGE_1, // internal
+        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

+        GGML_OP_CONV_1D_STAGE_0,  // internal
+        GGML_OP_CONV_1D_STAGE_1,  // internal
+
        GGML_OP_UPSCALE, // nearest interpolate

        GGML_OP_FLASH_ATTN,
@@ -1021,9 +1020,9 @@ extern "C" {
            struct ggml_tensor  * b,
            float                 eps);

-    // A: k columns, n rows => [ne03, ne02, n, k]
-    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
-    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
+    // A: n columns, m rows
+    // B: n columns, p rows  (i.e. we transpose it internally)
+    // result is m columns, p rows
    GGML_API struct ggml_tensor * ggml_mul_mat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -987,15 +987,12 @@ class SpecialVocab:
    merges: list[str] = []
    special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
    special_token_ids: dict[str, int] = {}
-    n_vocab: int | None = None

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: tuple[str, ...] | None = None,
-        n_vocab: int | None = None,
    ):
        self.special_token_ids = {}
-        self.n_vocab = n_vocab
        self.load_merges = load_merges
        if special_token_types is not None:
            self.special_token_types = special_token_types
@@ -1005,16 +1002,6 @@ class SpecialVocab:
        if not self._try_load_from_tokenizer_json(path):
            self._try_load_from_config_json(path)

-    def _set_special_token(self, typ: str, tid: Any):
-        if not isinstance(tid, int) or tid < 0:
-            return
-        if self.n_vocab is None or tid < self.n_vocab:
-            self.special_token_ids[typ] = tid
-            return
-        print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
-            file = sys.stderr)
-
-
    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if not tokenizer_file.is_file():
@@ -1042,11 +1029,10 @@ class SpecialVocab:
                tc_content = entry_content
            else:
                continue
-            # We only need the first match here.
-            maybe_token_id = next((
-                atok.get('id') for atok in added_tokens
-                if atok.get('content') == tc_content), None)
-            self._set_special_token(typ, maybe_token_id)
+            for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
+                if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
+                    self.special_token_ids[typ] = maybe_token_id
+                break
        return True

    def _try_load_from_config_json(self, path: Path) -> bool:
@@ -1056,21 +1042,21 @@ class SpecialVocab:
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
-            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+            maybe_token_id = config.get(f'{typ}_token_id')
+            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
+                self.special_token_ids[typ] = maybe_token_id
        return True

-    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
+    def add_to_gguf(self, gw: GGUFWriter) -> None:
        if len(self.merges) > 0:
-            if not quiet:
-                print(f'gguf: Adding {len(self.merges)} merge(s).')
+            print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        for typ, tokid in self.special_token_ids.items():
            handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if handler is None:
-                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
+                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
                continue
-            if not quiet:
-                print(f'gguf: Setting special token type {typ} to {tokid}')
+            print(f'gguf: Setting special token type {typ} to {tokid}')
            handler(tokid)

    def __repr__(self) -> str:
@@ -975,15 +975,14 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
    (void) tensor;
 }

-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
-    }
-    else {
+    } else {
        result.resize(n_tokens);
    }

@@ -1203,10 +1202,10 @@ struct llama_vocab {
    id special_eot_id    = 32010;

    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        GGML_ASSERT(token_left.find(" ") == std::string::npos);
-        GGML_ASSERT(token_left.find("\n") == std::string::npos);
-        GGML_ASSERT(token_right.find(" ") == std::string::npos);
-        GGML_ASSERT(token_right.find("\n") == std::string::npos);
+        replace_all(token_left,  " ",  "\u0120");
+        replace_all(token_left,  "\n", "\u010A");
+        replace_all(token_right, " ",  "\u0120");
+        replace_all(token_right, "\n", "\u010A");

        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
        if (it == bpe_ranks.end()) {
@@ -1578,14 +1577,12 @@ static void llama_kv_cache_seq_shift(
 enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
-    GGUF_FILE_VERSION_V3 = 3,
 };

 static const char * llama_file_version_name(llama_fver version) {
    switch (version) {
        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
-        case GGUF_FILE_VERSION_V2: return "GGUF V2";
-        case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
    }

    return "unknown";
@@ -2241,35 +2238,15 @@ static void llm_load_vocab(
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
    } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
    }

    // special tokens
-    {
-        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
-            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
-            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
-            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
-            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
-        };
-        for (const auto & it : special_token_types) {
-            const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it), old_id = id;
-
-            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
-            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
-            // can only come from the default value, so there's no point in
-            // validating that.
-            if (size_t(id + 1) > vocab.id_to_token.size()) {
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
-                    __func__, key.c_str(), id, old_id);
-                id = old_id;
-            }
-        }
-    }
+    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
+    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
+    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
+    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
+    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));

    // build special tokens cache
    {
@@ -2695,8 +2672,8 @@ static void llm_load_tensors(
                } break;
            case LLM_ARCH_STARCODER:
                {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},             GGML_BACKEND_CPU);
-                    model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"),   {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);

                    // output
                    {
@@ -2747,19 +2724,19 @@ static void llm_load_tensors(
                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, backend);

                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);

                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},   backend_split);
-                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},           backend);
+                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},           backend_split);

                        layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);

                        layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);
+                        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);

                        layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend);
+                        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend_split);

                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
@@ -4616,8 +4593,6 @@ static struct ggml_cgraph * llm_build_starcoder(

    const float norm_eps = hparams.f_norm_eps;

-    const int n_gpu_layers = model.n_gpu_layers;
-
    const int32_t n_tokens = batch.n_tokens;
    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
@@ -4662,27 +4637,6 @@ static struct ggml_cgraph * llm_build_starcoder(
        }
    }

-    const int i_gpu_start = n_layer - n_gpu_layers;
-    (void) i_gpu_start;
-
-    // offload functions set the tensor output backend to GPU
-    // tensors are GPU-accelerated if any input or the output has been offloaded
-    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
-    offload_func_t offload_func_kq = llama_nop;
-    offload_func_t offload_func_v  = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
-    }
-    if (n_gpu_layers > n_layer + 1) {
-        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
-    }
-    if (n_gpu_layers > n_layer + 2) {
-        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
-    }
-#endif // GGML_USE_CUBLAS
-
    {
        // Compute position embeddings.
        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -4708,7 +4662,6 @@ static struct ggml_cgraph * llm_build_starcoder(
    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    ggml_set_name(KQ_mask, "KQ_mask");
-    offload_func_kq(KQ_mask);
    ggml_allocr_alloc(lctx.alloc, KQ_mask);
    if (!ggml_allocr_is_measure(lctx.alloc)) {
        float * data = (float *) KQ_mask->data;
@@ -4732,67 +4685,44 @@ static struct ggml_cgraph * llm_build_starcoder(
    ggml_set_name(inpL, "inpL");

    for (int il = 0; il < n_layer; ++il) {
-        offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-        if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers_no_alloc;
-        }
-#endif // GGML_USE_CUBLAS
-
        {
            // Norm
            cur = ggml_norm(ctx0, inpL, norm_eps);
-            offload_func(cur);
-
            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
-            offload_func(cur);
        }

        {
            // Self Attention
-            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-            offload_func_kq(cur);
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);

-            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-            offload_func_kq(cur);
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));

-            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-            struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-            ggml_set_name(tmpq, "tmpq");
-            ggml_set_name(tmpk, "tmpk");
-            ggml_set_name(tmpv, "tmpv");
-
-            offload_func_kq(tmpq);
-            offload_func_kq(tmpk);
-            offload_func_v (tmpv);
-
-            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
+            struct ggml_tensor * Qcur = tmpq;
            struct ggml_tensor * Kcur = tmpk;

            {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
-                offload_func_v(Vcur);
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
                ggml_set_name(Vcur, "Vcur");

                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
-                offload_func_kq(k);
                ggml_set_name(k, "k");

                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                        (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
-                offload_func_v(v);
-                ggml_set_name(v, "v");

                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
            }

-            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
-            offload_func_kq(Q);
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
            ggml_set_name(Q, "Q");

            struct ggml_tensor * K =
@@ -4801,28 +4731,23 @@ static struct ggml_cgraph * llm_build_starcoder(
                        ggml_element_size(kv_self.k)*n_embd_gqa,
                        ggml_element_size(kv_self.k)*n_embd_head,
                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
-            offload_func_kq(K);
            ggml_set_name(K, "K");

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            offload_func_kq(KQ);
            ggml_set_name(KQ, "KQ");

            // KQ_scaled = KQ / sqrt(n_embd_head)
            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
-            offload_func_kq(KQ_scaled);
            ggml_set_name(KQ_scaled, "KQ_scaled");

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
-            offload_func_kq(KQ_masked);
            ggml_set_name(KQ_masked, "KQ_masked");

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-            offload_func_v(KQ_soft_max);
            ggml_set_name(KQ_soft_max, "KQ_soft_max");

            // split cached V into n_head heads
@@ -4835,25 +4760,22 @@ static struct ggml_cgraph * llm_build_starcoder(
            ggml_set_name(V, "V");

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-            offload_func_v(KQV);
            ggml_set_name(KQV, "KQV");

+            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            offload_func_v(KQV_merged);
            ggml_set_name(KQV_merged, "KQV_merged");

+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
-            offload_func_v(cur);
            ggml_set_name(cur, "KQV_merged_contiguous");
        }

        // Projection
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
-        offload_func(cur);

        // Add the input
        cur = ggml_add(ctx0, cur, inpL);
-        offload_func(cur);

        struct ggml_tensor * inpFF = cur;

@@ -4862,36 +4784,27 @@ static struct ggml_cgraph * llm_build_starcoder(
            // Norm
            {
                cur = ggml_norm(ctx0, inpFF, norm_eps);
-                offload_func_nr(cur);
-
                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
-                offload_func_nr(cur);
            }

            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
-            offload_func(cur);

            // GELU activation
            cur = ggml_gelu(ctx0, cur);
-            offload_func(cur);

            // Projection
            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
-            offload_func(cur);
        }

        inpL = ggml_add(ctx0, cur, inpFF);
-
    }

    // Output Norm
    {
        cur = ggml_norm(ctx0, inpL, norm_eps);
-        offload_func_nr(cur);
-
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
-        ggml_set_name(cur, "result_norm");
    }
+    ggml_set_name(cur, "result_norm");

    cur = ggml_mul_mat(ctx0, model.output, cur);
    ggml_set_name(cur, "result_output");
@@ -6025,6 +5938,8 @@ static int llama_decode_internal(
        }
    }

+    ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
    if (!lctx.embedding.empty()) {
        embeddings->backend = GGML_BACKEND_CPU;
@@ -6188,10 +6103,11 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-    static const char * hex = "0123456789ABCDEF";
    switch (llama_vocab_get_type(vocab)) {
    case LLAMA_VOCAB_TYPE_SPM: {
-        const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+        char buf[7];
+        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+        GGML_ASSERT(0 <= result && result < 7);
        return vocab.token_to_id.at(buf);
    }
    case LLAMA_VOCAB_TYPE_BPE: {
@@ -7557,14 +7473,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
        }
    }

-    const llama_token eos = llama_token_eos(&ctx->model);
+    const llama_token eos = llama_token_eos(ctx);

    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
    std::vector<llama_grammar_candidate>                              candidates_grammar;

    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        const std::string piece = llama_token_to_str(ctx, id);
        if (id == eos) {
            if (!allow_eos) {
                candidates->data[i].logit = -INFINITY;
@@ -7767,7 +7683,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
    const int64_t t_start_sample_us = ggml_time_us();

-    if (token == llama_token_eos(&ctx->model)) {
+    if (token == llama_token_eos(ctx)) {
        for (const auto & stack : grammar->stacks) {
            if (stack.empty()) {
                return;
@@ -7776,7 +7692,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        GGML_ASSERT(false);
    }

-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_str(ctx, token);

    // Note terminating 0 in decoded string
    const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
@@ -8049,24 +7965,6 @@ struct no_init {
    no_init() { /* do nothing */ }
 };

-struct quantize_state_internal {
-    const llama_model                 & model;
-    const llama_model_quantize_params * params;
-#ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-    int i_attention_wv    = 0;
-    int i_feed_forward_w2 = 0;
-
-    int n_k_quantized     = 0;
-    int n_fallback        = 0;
-#endif
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
-        : model(model)
-        , params(params)
-        {}
-};
-
 static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
@@ -8127,13 +8025,12 @@ static void llama_convert_tensor_internal(

 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    quantize_state_internal & qs,
-    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
 ) {
    const std::string name = ggml_get_name(tensor);
    // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const llm_arch arch = qs.model.arch;
-    const auto       tn = LLM_TN(arch);
+    const auto tn = LLM_TN(model.arch);

    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
@@ -8141,7 +8038,7 @@ static ggml_type get_k_quant_type(

    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
        }
        else if (new_type != GGML_TYPE_Q8_0) {
@@ -8150,46 +8047,46 @@ static ggml_type get_k_quant_type(
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (qs.model.type == MODEL_70B) {
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
-        ++qs.i_attention_wv;
+        ++*i_attention_wv;
    } else if (name.find("ffn_down.weight") != std::string::npos) {
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
            new_type = GGML_TYPE_Q5_K;
        }
-        ++qs.i_feed_forward_w2;
+        ++*i_feed_forward_w2;
    } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (arch != LLM_ARCH_FALCON) {
+        if (model.arch != LLM_ARCH_FALCON) {
            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8216,23 +8113,20 @@ static ggml_type get_k_quant_type(
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
            convert_incompatible_tensor = true;
-        } else {
-            ++qs.n_k_quantized;
        }
    }
    if (convert_incompatible_tensor) {
-        switch (new_type) {
-            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
-            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
-            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
-            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
-            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
-            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
        }
-        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-        ++qs.n_fallback;
    }

    return new_type;
@@ -8290,8 +8184,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    llm_load_arch(ml, model);
    llm_load_hparams(ml, model);

-    struct quantize_state_internal qs(model, params);
-
    if (params->only_copy) {
        ftype = model.ftype;
    }
@@ -8305,6 +8197,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    gguf_set_val_u32(ctx_out, "general.file_type", ftype);

 #ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+
    for (int i = 0; i < ml.n_tensors; ++i) {
        struct ggml_tensor * meta = ml.get_tensor_meta(i);

@@ -8312,16 +8207,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++qs.n_attention_wv;
+            ++n_attention_wv;
        }
        else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++qs.n_feed_forward_w2;
+            ++n_feed_forward_w2;
        }
    }
-    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
 #endif

    size_t total_size_org = 0;
@@ -8388,7 +8286,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        if (quantize) {
            new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
@@ -8514,12 +8414,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            LLAMA_LOG_INFO("\n");
        }
    }
-#ifdef GGML_USE_K_QUANTS
-    if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
-                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
-    }
-#endif
 }

 static int llama_apply_lora_from_file_internal(
@@ -8998,7 +8892,7 @@ struct llama_context * llama_new_context_with_model(
            // build worst-case graph
            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
            int n_past = cparams.n_ctx - n_tokens;
-            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
            ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

 #ifdef GGML_USE_METAL
@@ -9759,44 +9653,43 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }

-const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
-    return model->vocab.id_to_token[token].text.c_str();
+const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].text.c_str();
 }

-float llama_token_get_score(const struct llama_model * model, llama_token token) {
-    return model->vocab.id_to_token[token].score;
+float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].score;
 }

-llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
-    return model->vocab.id_to_token[token].type;
+llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].type;
 }

-llama_token llama_token_bos(const struct llama_model * model) {
-    return model->vocab.special_bos_id;
+llama_token llama_token_bos(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_bos_id;
 }

-llama_token llama_token_eos(const struct llama_model * model) {
-    return model->vocab.special_eos_id;
+llama_token llama_token_eos(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eos_id;
 }

-llama_token llama_token_nl(const struct llama_model * model) {
-    return model->vocab.linefeed_id;
+llama_token llama_token_nl(const struct llama_context * ctx) {
+    return ctx->model.vocab.linefeed_id;
+}
+llama_token llama_token_prefix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_prefix_id;
 }

-llama_token llama_token_prefix(const struct llama_model * model) {
-    return model->vocab.special_prefix_id;
+llama_token llama_token_middle(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_middle_id;
 }

-llama_token llama_token_middle(const struct llama_model * model) {
-    return model->vocab.special_middle_id;
+llama_token llama_token_suffix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_suffix_id;
 }

-llama_token llama_token_suffix(const struct llama_model * model) {
-    return model->vocab.special_suffix_id;
-}
-
-llama_token llama_token_eot(const struct llama_model * model) {
-    return model->vocab.special_eot_id;
+llama_token llama_token_eot(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eot_id;
 }

 int llama_tokenize(
@@ -178,7 +178,7 @@ extern "C" {
        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model

        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool embedding;  // embedding mode only
@@ -494,22 +494,21 @@ extern "C" {
    // Vocab
    //

-    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);

-    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);

-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);

    // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
-
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
    // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle

    //
    // Tokenization
@@ -658,7 +657,6 @@ extern "C" {
                           float * mu);

    /// @details Selects the token with the highest probability.
-    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
    LLAMA_API llama_token llama_sample_token_greedy(
            struct llama_context * ctx,
          llama_token_data_array * candidates);
@@ -28,14 +28,9 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_build_executable(test-tokenizer-1-llama.cpp)
 llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
@@ -91,19 +91,9 @@ int main(int argc, char **argv) {
            }
        }
    }
-    // Restrict to assigned unicode planes
+    // TODO: why doesn't this work for the full range of Unicodes?
    // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
-        }
-    }
-    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
+    for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_bpe(ctx, tokens);
@@ -113,6 +103,7 @@ int main(int argc, char **argv) {
            return 4;
        }
    }
+
    llama_free_model(model);
    llama_free(ctx);
Author	SHA1	Message	Date
Georgi Gerganov	c0f4d54870	server : add comment about changing slot_state to bool	2023-10-22 22:24:39 +03:00
Georgi Gerganov	83e1490187	server : fix slot reuse	2023-10-22 21:57:23 +03:00
Georgi Gerganov	8fe7ca4875	server : apply fix from #3722	2023-10-22 21:05:45 +03:00
Georgi Gerganov	00ae55b388	server : hide ctx_sampling->prev behind API (#3696 )	2023-10-22 20:09:25 +03:00
M. Yusuf Sarıgöz	3d6a687f1d	Update readme to document multimodal in server	2023-10-22 20:03:35 +03:00
Georgi Gerganov	dd1af2ed35	server : minor style	2023-10-22 19:52:50 +03:00
M. Yusuf Sarıgöz	a4d69d8b81	Merge branch 'server-rev' of https://github.com//ggerganov/llama.cpp into server-rev	2023-10-22 19:49:48 +03:00
M. Yusuf Sarıgöz	2679c432d5	Update readme to document multimodal in server	2023-10-22 19:49:33 +03:00
Georgi Gerganov	a8063171bd	server : completion requests remember slot_id	2023-10-22 19:34:48 +03:00
Georgi Gerganov	f305d6434f	editorconfig : new line in index.html	2023-10-22 19:10:30 +03:00
M. Yusuf Sarıgöz	5359fb9267	Do not save/load image_data to localStorage	2023-10-22 19:08:09 +03:00
Georgi Gerganov	f67d971344	server : bug fix for prompt caching	2023-10-22 17:52:59 +03:00
Georgi Gerganov	569ebf11cf	server : refactor ctx_sampling init + n_ctx + names	2023-10-22 16:57:05 +03:00
Georgi Gerganov	ef18f4d579	server : fix crash in Debug on macOS (I have no idea why this fixes it!?)	2023-10-22 16:55:40 +03:00
Georgi Gerganov	197a0a9e23	server : fix switch fallthrough	2023-10-22 16:55:05 +03:00
Georgi Gerganov	715f384a6b	clip : link to ggml, not to llama	2023-10-22 16:52:12 +03:00
Georgi Gerganov	4b4ab722ab	make : silence stb warnings	2023-10-22 16:51:59 +03:00
Georgi Gerganov	176993c871	Merge branch 'master' into server-rev	2023-10-22 15:04:16 +03:00
FSSRepo	2eb4c11ec5	fix image load + view image in chat	2023-10-21 14:34:19 -04:00
Jhen-Jie Hong	17b23eb9cb	server : fix multibyte handle in partial response (#3706 )	2023-10-21 14:58:03 +03:00
Georgi Gerganov	778c070d1b	server : logs + minor code style	2023-10-20 20:44:51 +03:00
Georgi Gerganov	5d540e80d1	server : no need for atomic int - already using mutex	2023-10-20 20:44:29 +03:00
Georgi Gerganov	113dd60005	server : bach has to be allocated for n_parallel sequences	2023-10-20 20:42:45 +03:00
FSSRepo	6b2437e32d	added thread safe pipeline	2023-10-20 12:07:32 -04:00
Georgi Gerganov	325d1793f7	server : minor sync	2023-10-19 15:03:24 +03:00
Georgi Gerganov	9740824ba5	server : snake case	2023-10-19 14:44:37 +03:00
Georgi Gerganov	e3a2c3fe32	server : use refs + use llama_batch_clear()	2023-10-19 14:44:04 +03:00
Georgi Gerganov	3d5929e8ee	server : bug fix in ingest_images n_tokens is incremented internally by llama_batch_add	2023-10-19 14:43:19 +03:00
Georgi Gerganov	a8c981b734	server : remove beam-search functionality	2023-10-19 14:10:37 +03:00
Georgi Gerganov	654e0a1fe0	server : coding-style normalization (part 2)	2023-10-19 14:09:45 +03:00
Georgi Gerganov	e44ed60187	server : coding-style normalization	2023-10-19 13:50:23 +03:00
FSSRepo	ab2fc00224	latest changes of sampling API	2023-10-18 16:57:48 -04:00
FSSRepo	8540568c48	Merge branch 'master' of https://github.com/ggerganov/llama.cpp	2023-10-18 16:55:26 -04:00
FSSRepo	7196c4e08a	new sampling API	2023-10-18 16:50:09 -04:00
Steward Garcia	84b8f2b060	Merge branch 'ggerganov:master' into master	2023-10-18 08:43:17 -04:00
FSSRepo	35fd37430f	fix zig build	2023-10-17 18:04:26 -04:00
FSSRepo	c02c52efb5	fix multiple clients	2023-10-17 17:54:56 -04:00
FSSRepo	d2b1fac6c7	fix make bui;d errors	2023-10-17 17:18:56 -04:00
FSSRepo	ed0c11cb83	multimodal support enabled by default	2023-10-17 16:58:20 -04:00
FSSRepo	6c277eaab5	update api like OpenAI	2023-10-17 16:53:38 -04:00
FSSRepo	58f8ae9bfe	readme change	2023-10-17 16:32:19 -04:00
FSSRepo	fa0f22f14f	Merge remote-tracking branch 'upstream/master'	2023-10-17 16:31:33 -04:00
FSSRepo	aa2268f4cd	sync README.md changes	2023-10-17 16:21:05 -04:00
FSSRepo	4d1804330e	fix llava implementation	2023-10-16 16:31:17 -04:00
FSSRepo	d7eca255d7	context shift fixed	2023-10-16 14:43:10 -04:00
FSSRepo	2d9f11db28	fixed premature end due stop word	2023-10-16 12:36:05 -04:00
FSSRepo	fd64f04fc2	fix long prompt than ctx proposed in #3639	2023-10-15 19:07:18 -04:00
FSSRepo	b727e022d6	fix ci make build undefined ref errors	2023-10-15 18:53:48 -04:00
FSSRepo	ce961a304b	some ci fixes	2023-10-15 18:46:01 -04:00
Steward Garcia	9035978aae	Merge pull request #6 from damian0815/fssrepo_mac_fixes fix compilation errors with llvm	2023-10-15 18:38:52 -04:00
Steward Garcia	f47fd17b73	Merge branch 'ggerganov:master' into master	2023-10-15 18:23:47 -04:00
FSSRepo	4e5c5c451c	notify the user from server ui that multimodality is unavialable	2023-10-14 08:28:49 -04:00
Damian Stewart	299f6b54d8	fix compilation errors with llvm	2023-10-14 11:17:38 +02:00
FSSRepo	7e64bfe060	refactor code + remove unused comments + improved README.md	2023-10-14 00:31:34 -04:00
FSSRepo	9f72b44635	add multimodal input - alfa	2023-10-13 23:36:32 -04:00
FSSRepo	de35b47908	fixed tokens probs	2023-10-13 19:55:25 -04:00
FSSRepo	9d98cdda2c	llava multimodal integration	2023-10-13 18:42:44 -04:00
FSSRepo	eb08201227	add changes to README.md	2023-10-13 14:28:06 -04:00
FSSRepo	a2c2d98c16	add context swap	2023-10-13 14:12:50 -04:00
FSSRepo	b6d9e212e5	fixed timings per slot	2023-10-13 13:10:38 -04:00
FSSRepo	a410a9e300	unused change reverted	2023-10-13 12:23:58 -04:00
FSSRepo	6358ae5f48	server ui now support multiple clients	2023-10-13 12:22:54 -04:00
FSSRepo	4ba5a5013d	chat.mjs support cached prompt + some fixes	2023-10-13 11:06:41 -04:00
FSSRepo	500ac7120e	cached prompt support	2023-10-12 21:16:12 -04:00
FSSRepo	83c2b3553a	grammar + no stream completion	2023-10-12 18:43:57 -04:00
FSSRepo	5b8e29de53	multiple client support	2023-10-12 17:09:12 -04:00
FSSRepo	81484805f0	completion endpoint working	2023-10-12 16:17:27 -04:00
FSSRepo	29c8cdd65d	refactored sampling function	2023-10-12 15:02:19 -04:00
FSSRepo	b716eeb72a	Merge branch 'master' of https://github.com/ggerganov/llama.cpp	2023-10-12 12:55:08 -04:00
FSSRepo	78504218b9	save dev progress	2023-10-12 12:51:48 -04:00
FSSRepo	471230202d	crash fixed	2023-10-11 19:48:15 -04:00
FSSRepo	63f99b1ea6	implementing parallel decoding in server example	2023-10-11 18:14:11 -04:00