cmake : Handle mixed-case 'Power' strings in POWER CPU detection (#13966)

Some systems report the CPU implementation as "Power11" instead of "POWER11". The existing CMake logic uses a case-sensitive regular expression to extract the CPU generation, which fails when the casing doesn't exactly match "POWER". This patch fixes that by converting the string to uppercase before applying the regex.

Signed-off-by: root <root@rheldb2v.pperf.tadn.ibm.com>
Co-authored-by: root <root@rheldb2v.pperf.tadn.ibm.com>
sycl: quantize and reorder the input to q8_1 when reorder is enabled (#13826 )
2026-06-30 17:47:40 +02:00 · 2025-06-02 15:18:36 +03:00 · 2025-06-02 10:12:20 +01:00 · 2025-06-01 18:08:05 +02:00 · 2025-06-01 18:07:21 +02:00 · 2025-06-01 17:23:11 +02:00
138 changed files with 7598 additions and 6138 deletions
@@ -49,6 +49,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

-[tools/mtmd/vendor/miniaudio.h]
+[vendor/miniaudio/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
@@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>

+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -58,23 +58,20 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
-    chat.cpp
-    chat.h
    chat-parser.cpp
    chat-parser.h
+    chat.cpp
+    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
-    json-schema-to-grammar.cpp
-    json.hpp
-    json-partial.h
    json-partial.cpp
+    json-partial.h
+    json-schema-to-grammar.cpp
    llguidance.cpp
    log.cpp
    log.h
-    minja/chat-template.hpp
-    minja/minja.hpp
    ngram-cache.cpp
    ngram-cache.h
    regex-partial.cpp
@@ -147,7 +144,7 @@ if (LLAMA_LLGUIDANCE)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()

-target_include_directories(${TARGET} PUBLIC .)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
 #include "arg.h"

+#include "chat.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "chat.h"

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -15,6 +16,9 @@
 #include <windows.h>
 #endif

+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -34,8 +38,6 @@
 #include <future>
 #endif

-#include "json-schema-to-grammar.h"
-
 using json = nlohmann::ordered_json;

 std::initializer_list<enum llama_example> mmproj_examples = {
@@ -1346,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -154,9 +154,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
            if (!rest.empty()) {
                handle_reasoning(rest, /* closed */ !is_partial());
            }
-            if (!syntax_.thinking_forced_open) {
-                throw common_chat_msg_partial_exception(end_think);
-            }
+            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+            // if (!syntax_.thinking_forced_open) {
+            //     throw common_chat_msg_partial_exception(end_think);
+            // }
            return true;
        }
    }
@@ -2,9 +2,10 @@

 #include "chat.h"
 #include "json-partial.h"
-#include "json.hpp"
 #include "regex-partial.h"

+#include <nlohmann/json.hpp>
+
 #include <optional>
 #include <string>
 #include <vector>
@@ -1,13 +1,14 @@
 #include "chat.h"
 #include "chat-parser.h"
 #include "common.h"
+#include "json-partial.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
-#include "json-partial.h"
-#include "minja/chat-template.hpp"
-#include "minja/minja.hpp"
 #include "regex-partial.h"

+#include <minja/chat-template.hpp>
+#include <minja/minja.hpp>
+
 #include <cstdio>
 #include <exception>
 #include <iostream>
@@ -16,7 +17,6 @@
 #include <string>
 #include <vector>

-
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
    auto time = std::chrono::system_clock::to_time_t(now);
    auto local_time = *std::localtime(&time);
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

    int p = 0;
    switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p =  5;  break;
        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
@@ -1,9 +1,10 @@
-#include <json-partial.h>
-#include "ggml.h"
-#include "log.h"
-#include <string>
+#include "json-partial.h"

-#include <json.hpp>
+#include "log.h"
+
+#include <nlohmann/json.hpp>
+
+#include <string>

 using json = nlohmann::ordered_json;

@@ -1,5 +1,6 @@
 #pragma once
-#include <json.hpp>
+
+#include <nlohmann/json.hpp>

 // Healing marker (empty if the JSON was fully parsed / wasn't healed).
 struct common_healing_marker {
@@ -1,8 +1,9 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"

+#include <nlohmann/json.hpp>
+
 #include <algorithm>
-#include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>
@@ -1,9 +1,9 @@
 #pragma once

-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>
+
+#include <functional>
+#include <string>

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                   bool force_gbnf = false);
@@ -523,15 +523,15 @@ class TextModel(ModelBase):
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")

-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
            logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
            self.gguf_writer.add_head_count(n_head)
            logger.info(f"gguf: head count = {n_head}")

@@ -674,12 +674,12 @@ class TextModel(ModelBase):
        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
            # ref: https://huggingface.co/tiiuae/falcon-7b
            res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
            res = "bert-bge-large"
@@ -731,9 +731,6 @@ class TextModel(ModelBase):
        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
            res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
            # ref: https://huggingface.co/LumiOpen/Viking-7B
            res = "viking"
@@ -764,9 +761,6 @@ class TextModel(ModelBase):
        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
            # ref: https://huggingface.co/facebook/chameleon-7b
            res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
            res = "roberta-bpe"
@@ -797,15 +791,24 @@ class TextModel(ModelBase):
        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
            res = "llama4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
            # ref: https://huggingface.co/mistral-community/pixtral-12b
            res = "pixtral"
        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
            res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"

        if res is None:
            logger.warning("\n")
@@ -1044,6 +1047,10 @@ class TextModel(ModelBase):
        special_vocab.chat_template = "rwkv-world"
        # hack: Add '\n\n' as the EOT token to make it chat normally
        special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
+
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
@@ -3690,14 +3697,20 @@ class BertModel(TextModel):
        super().__init__(*args, **kwargs)
        self.vocab_size = None

+        if cls_out_labels := self.hparams.get("id2label"):
+            if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
+                # Remove dummy labels added by AutoConfig
+                cls_out_labels = None
+        self.cls_out_labels = cls_out_labels
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_causal_attention(False)
        self._try_set_pooling_type()

-        if cls_out_labels := self.hparams.get("id2label"):
+        if self.cls_out_labels:
            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
-            self.gguf_writer.add_array(key_name, [v for k, v in sorted(cls_out_labels.items())])
+            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])

    def set_vocab(self):
        tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3749,7 +3762,7 @@ class BertModel(TextModel):
        if name.startswith("cls.seq_relationship"):
            return []

-        if self.hparams.get("id2label"):
+        if self.cls_out_labels:
            # For BertForSequenceClassification (direct projection layer)
            if name == "classifier.weight":
                name = "classifier.out_proj.weight"
@@ -3776,62 +3789,111 @@ class BertModel(TextModel):
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+
+        tokenizer_json = {}
+        tokenizer_config_json = {}
        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+            tokenizer_path = self.dir_model / 'tokenizer.json'
+            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'

-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+            if not tokenizer_path.is_file():
+                raise FileNotFoundError(f"File not found: {tokenizer_path}")

-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+            from base64 import b64decode
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
+            with open(tokenizer_path, "r", encoding="utf-8") as fp:
+                tokenizer_json = json.load(fp)

-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            if tokenizer_config_path.is_file():
+                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+                    tokenizer_config_json = json.load(fp)
+
+            add_prefix = tokenizer.add_prefix_space
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
+
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
+        else:
+            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
+        if isinstance(tokenizer, SentencePieceProcessor):
+            for token_id in range(tokenizer.vocab_size()):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)

-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = SentencePieceTokenTypes.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = SentencePieceTokenTypes.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = SentencePieceTokenTypes.BYTE

-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+        else:
+            added_vocab = tokenizer.get_added_vocab()
+            unk_token = tokenizer_config_json.get("unk_token")
+            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))

-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
+            for token_id in range(tokenizer.vocab_size):
+                piece = tokenizer._convert_id_to_token(token_id)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]

-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
+
+        if isinstance(tokenizer, SentencePieceProcessor):
+            # realign tokens (see HF tokenizer code)
+            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+            toktypes = [
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.UNKNOWN,
+            ] + toktypes[3:-1]
+
+            if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+                # Add mask token missing from sentencepiece.bpe.model
+                tokens[250001] = b'<mask>'
+                scores[250001] = 0.0
+                toktypes[250001] = SentencePieceTokenTypes.CONTROL

        self.gguf_writer.add_tokenizer_model("t5")
        self.gguf_writer.add_tokenizer_pre("default")
@@ -3851,6 +3913,26 @@ class BertModel(TextModel):
        self.gguf_writer.add_add_eos_token(True)


+@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
+class DistilBertModel(BertModel):
+    # Converter registered for the three DistilBERT architecture names above.
+    # It reuses the BERT architecture id and BertModel's conversion logic,
+    # overriding only the two methods below.
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def set_gguf_parameters(self):
+        # Writes a fixed layer-norm epsilon of 1e-12, then delegates to BertModel.
+        # NOTE(review): presumably DistilBERT configs do not carry this value - confirm.
+        self.gguf_writer.add_layer_norm_eps(1e-12)
+        logger.info("gguf: layer norm epsilon = 1e-12")
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Normalizes DistilBERT tensor names and drops the "vocab_" tensors before
+        # handing the tensor to BertModel.modify_tensors.
+        if name.startswith("distilbert."):
+            # 11 == len("distilbert."): strip the wrapper prefix from the tensor name
+            name = name[11:]
+
+        # These layers act as MLM head, so we don't need them
+        if name.startswith("vocab_"):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 class RobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT
@@ -1,28 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
-
 import logging
 import os
 import pathlib
@@ -32,6 +10,7 @@ import requests
 import sys
 import json
 import shutil
+import argparse

 from hashlib import sha256
 from enum import IntEnum, auto
@@ -41,6 +20,11 @@ logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()

+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
+

 class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
@@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
    UGM = auto()


+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+
+Instructions:
+
+- Add a new model to the "models" list
+- Run the script with your huggingface token
+    By default, token will be read from ~/.cache/huggingface/token
+- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+
+# Command-line interface: DOC_STRING is shown as the --help description.
+# RawTextHelpFormatter keeps the description's line breaks as written.
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+    "--full", action="store_true",
+    help="download full list of models - make sure you have access to all of them",
+)
+# nargs="?" makes the positional token optional (args.hf_token is None when omitted)
+parser.add_argument(
+    "hf_token",
+    help="optional HF token",
+    nargs="?",
+)
+args = parser.parse_args()
+# a token given on the command line takes precedence over the previously set hf_token value
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+
+# neither source provided a token: report the problem and exit with status 1
+if hf_token is None:
+    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+    sys.exit(1)
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
 # TODO: add models here, base models preferred
 models = [
    {"name": "llama-spm",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -103,7 +116,6 @@ models = [
    {"name": "exaone",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
    {"name": "phi-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
    {"name": "chameleon",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
    {"name": "roberta-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
    {"name": "gigachat",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
    {"name": "megrez",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@@ -114,11 +126,19 @@ models = [
    {"name": "trillion",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
    {"name": "bailingmoe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
    {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
-    {"name": "glm4",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
    {"name": "pixtral",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
    {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]

+# some models are known to be broken upstream, so we will skip them as exceptions
+pre_computed_hashes = [
+    # chatglm-bpe has 2 hashes, why?
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
+    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+]
+

 def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
@@ -169,9 +189,29 @@ def download_model(model):
            if os.path.isfile(save_path):
                logger.info(f"{name}: File {save_path} already exists - skipping")
                continue
-            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+            download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)


+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+    # convert_py: the text to scan, passed in as a string.
+    # The pattern matches an `if chkhsh == "<64 lowercase hex chars>":` line,
+    # optionally followed by an intervening line (e.g. a "# ref:" comment),
+    # and then `res = "<name>"`; it captures the hash and the name.
+    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+    matches = re.findall(pattern, convert_py)
+    output = {}
+    for chkhsh, res in matches:
+        # a name that appears with several hashes keeps only the last hash found
+        output[res] = chkhsh
+    return output
+
+
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+    # Filter out models that already exist in convert_hf_to_gguf.py
+    existing_models = get_existing_models(convert_py)
+    all_models = models.copy()
+    models = [model for model in all_models if model["name"] not in existing_models]
+
+logging.info(f"Downloading {len(models)} models...")
 for model in models:
    try:
        download_model(model)
@@ -182,9 +222,10 @@ for model in models:
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
    name = model["name"]
    tokt = model["tokt"]
+    chkhsh = model.get("chkhsh")

    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue
@@ -195,35 +236,44 @@ for model in models:
        continue

    # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
+    if chkhsh is not None:
+        # if the model has a pre-computed hash, use it
+        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+    elif name in existing_models:
+        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
+        chkhsh = existing_models[name]
+    else:
+        # otherwise, compute the hash of the tokenizer
+        try:
+            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
+            if name == "t5":
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        except OSError as e:
+            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+            continue  # Skip to the next model if the tokenizer can't be loaded

-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
+        chktok = tokenizer.encode(CHK_TXT)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()

-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
+        logger.info(f"model: {name}")
+        logger.info(f"tokt: {tokt}")
+        logger.info(f"repo: {model['repo']}")
+        logger.info(f"chktok: {chktok}")
+        logger.info(f"chkhsh: {chkhsh}")

-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+        # print the "pre_tokenizer" content from the tokenizer.json
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

-    logger.info("")
+        logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
@@ -271,8 +321,6 @@ src_func = f"""
        return res
 """

-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
@@ -367,6 +415,10 @@ for model in models:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

+    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
+        logger.info(f"Skip vocab files for model {name}, no GGUF file found")
+        continue
+
    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")
@@ -63,6 +63,7 @@ cmake --build build --config Release
      cmake --preset x64-windows-llvm-release
      cmake --build build-x64-windows-llvm-release
      ```
+- Curl support is enabled by default and requires the libcurl development libraries to be installed. It can be turned off with `-DLLAMA_CURL=OFF`.

 ## BLAS Build

@@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel

 ## Example

-Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
+Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question.

 ```bash
 llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
    common_params params;

    params.n_predict = 128;
-    params.n_junk = 0;
+    params.n_junk = 1;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
    const bool is_sp_shared = params.is_pp_shared;

    // extra text to insert in each client's prompt in order to make it larger
-    const int32_t n_junk = params.n_junk;
+    const int32_t n_junk = std::max(1, params.n_junk);

    // init llama.cpp
    llama_backend_init();
@@ -315,7 +315,10 @@ int main(int argc, char ** argv) {
                    } else {
                        client.prompt += k_system;
                    }
-                    for (int i = 0; i < n_junk; ++i) {
+
+                    const int n_junk_cur = rand() % n_junk;
+
+                    for (int i = 0; i < n_junk_cur; ++i) {
                        const int r = rand() % k_questions.size();
                        client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
                    }
@@ -340,7 +343,7 @@ int main(int argc, char ** argv) {
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;

-                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);

                    g_seq_id += 1;

@@ -359,7 +362,9 @@ int main(int argc, char ** argv) {
        // process in chunks of params.n_batch
        int32_t n_batch = params.n_batch;

-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        int32_t i_next = 0;
+
+        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
            // experiment: process in powers of 2
            //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
            //    n_batch /= 2;
@@ -367,7 +372,7 @@ int main(int argc, char ** argv) {
            //    continue;
            //}

-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

            llama_batch batch_view = {
                n_tokens,
@@ -387,19 +392,24 @@ int main(int argc, char ** argv) {
                    return 1;
                }

-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);

                n_cache_miss += 1;

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
-                i -= n_batch;

                continue;
            }

            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);

+            // move the head of the batch forward with the number of tokens we just processed
+            i_next = i + n_tokens;
+
+            // on successful decode, restore the original batch size
+            n_batch = params.n_batch;
+
            for (auto & client : clients) {
                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                    continue;
@@ -133,9 +133,8 @@ int main(int argc, char ** argv) {
            const int ib = i/n_batch - 1;
            const int bd = n_batch_grp*(n_grp - 1);

-            llama_kv_self_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_self_update  (ctx);
+            llama_kv_self_seq_add(ctx, 0, n_past - n_batch,         n_past,         ib*bd);
+            llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);

            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
@@ -169,8 +168,6 @@ int main(int argc, char ** argv) {

        llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-      //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);

        n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

@@ -200,8 +197,6 @@ int main(int argc, char ** argv) {

            llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-          //llama_kv_self_defrag (ctx);
-            llama_kv_self_update (ctx);

            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
@@ -24,3 +24,28 @@ function(ggml_get_flags CCID CCVER)
    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
 endfunction()
+
+# Detect the CPU architecture family of the target system.
+#
+# Reads CMAKE_OSX_ARCHITECTURES, CMAKE_GENERATOR_PLATFORM_LWR and CMAKE_SYSTEM_PROCESSOR
+# and sets GGML_SYSTEM_ARCH in the caller's scope to one of:
+#   ARM, x86, PowerPC, loongarch64, riscv64, s390x or UNKNOWN
+#
+# For ARM and x86, CMAKE_SYSTEM_PROCESSOR is only consulted when neither
+# CMAKE_OSX_ARCHITECTURES nor CMAKE_GENERATOR_PLATFORM_LWR is set.
+#
+# Example:
+#   ggml_get_system_arch()
+#   message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
+function(ggml_get_system_arch)
+    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+        set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE)
+    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR
+            CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
+        set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
+    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
+            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+        set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
+    # The checks below name the variable instead of expanding it unquoted:
+    # with an empty CMAKE_SYSTEM_PROCESSOR an unquoted ${...} expands to nothing,
+    # leaving MATCHES without a left-hand operand and failing the configure step.
+    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
+        set(GGML_SYSTEM_ARCH "loongarch64"  PARENT_SCOPE)
+    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+        set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE)
+    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
+        set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE)
+    else()
+        set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE)
+    endif()
+endfunction()
@@ -2095,9 +2095,6 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);

-    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
    // print info and performance information for the graph
    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

@@ -2181,6 +2178,7 @@ extern "C" {

    // scheduling priorities
    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
        GGML_SCHED_PRIO_NORMAL,
        GGML_SCHED_PRIO_MEDIUM,
        GGML_SCHED_PRIO_HIGH,
@@ -109,6 +109,8 @@ if (MSVC)
 else ()
    set(CMAKE_GENERATOR_PLATFORM_LWR "")
 endif ()
+ggml_get_system_arch()
+message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")

 if (NOT MSVC)
    if (GGML_STATIC)
@@ -194,6 +196,7 @@ add_library(ggml-base
            ../include/ggml-opt.h
            ../include/gguf.h
            ggml.c
+            ggml.cpp
            ggml-alloc.c
            ggml-backend.cpp
            ggml-opt.cpp
@@ -224,6 +227,7 @@ function(ggml_add_backend_library backend)
        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
        add_dependencies(ggml ${backend})
+        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
    else()
        add_library(${backend} ${ARGN})
        target_link_libraries(ggml PUBLIC ${backend})
@@ -287,16 +291,20 @@ if (GGML_CPU_ALL_VARIANTS)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
    endif()
-    ggml_add_cpu_backend_variant(x64)
-    ggml_add_cpu_backend_variant(sse42        SSE42)
-    ggml_add_cpu_backend_variant(sandybridge  SSE42 AVX)
-    ggml_add_cpu_backend_variant(haswell      SSE42 AVX F16C AVX2 BMI2 FMA)
-    ggml_add_cpu_backend_variant(skylakex     SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake      SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake    SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
-    if (NOT MSVC)
-        # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        ggml_add_cpu_backend_variant(x64)
+        ggml_add_cpu_backend_variant(sse42        SSE42)
+        ggml_add_cpu_backend_variant(sandybridge  SSE42 AVX)
+        ggml_add_cpu_backend_variant(haswell      SSE42 AVX F16C AVX2 BMI2 FMA)
+        ggml_add_cpu_backend_variant(skylakex     SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+        ggml_add_cpu_backend_variant(icelake      SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+        ggml_add_cpu_backend_variant(alderlake    SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+        if (NOT MSVC)
+            # MSVC doesn't support AMX
+            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+        endif()
+    else()
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
    endif()
 elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    // allocate graph
    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
-        ggml_backend_sched_synchronize(sched);
+        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra

    ggml_backend_sched_split_graph(sched, graph);

-
    if (!ggml_backend_sched_alloc_splits(sched)) {
        return false;
    }
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_backend_synchronize(sched->backends[i]);
    }
-    // reset the current copy to 0 so that the graphs will be similar during generation
-    // necessary for CUDA graphs
-    sched->cur_copy = 0;
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }

 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
-    message(ERROR "BLAS not found, please refer to "
-                  "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                  " to set correct GGML_BLAS_VENDOR")
+    message(FATAL_ERROR "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct GGML_BLAS_VENDOR")
 endif()
@@ -82,13 +82,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
    endif()

-    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
-        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-
+    if (GGML_SYSTEM_ARCH STREQUAL "ARM")
        message(STATUS "ARM detected")
-
        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
        else()
@@ -170,12 +165,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                endforeach()
            endif()
        endif()
-    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
-
+    elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
        message(STATUS "x86 detected")
-
        if (MSVC)
            # instruction set detection for MSVC only
            if (GGML_NATIVE)
@@ -318,7 +309,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
        endif()
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
        message(STATUS "PowerPC detected")
        if (GGML_NATIVE)
            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@@ -327,7 +318,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
            endif()

-            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+            string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
+            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
@@ -344,9 +336,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
            endif()
        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
        message(STATUS "loongarch64 detected")
-
        list(APPEND ARCH_FLAGS -march=loongarch64)
        if (GGML_LASX)
            list(APPEND ARCH_FLAGS -mlasx)
@@ -354,8 +345,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        if (GGML_LSX)
            list(APPEND ARCH_FLAGS -mlsx)
        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
-        message(STATUS "RISC-V detected")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        message(STATUS "riscv64 detected")
        if (GGML_RVV)
            if (GGML_XTHEADVECTOR)
                list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -365,7 +356,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
            endif()
        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
        message(STATUS "s390x detected")
        file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
        string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
@@ -6995,7 +6995,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
+#ifdef __ARM_FEATURE_MATMUL_INT8
+    assert((nrc == 2) || (nrc == 1));
+#else
    assert(nrc == 1);
+#endif
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
@@ -7012,6 +7016,146 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    uint32_t utmp[4];

+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_K * GGML_RESTRICT x0 = x;
+        const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
+        const block_q8_K * GGML_RESTRICT y0 = y;
+        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+        float32x4_t vfsum = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
+            const uint8_t * GGML_RESTRICT qx0 = x0->qs;
+            const uint8_t * GGML_RESTRICT qx1 = x1->qs;
+            const  int8_t * GGML_RESTRICT qy0 = y0->qs;
+            const  int8_t * GGML_RESTRICT qy1 = y1->qs;
+
+            // decode scales and mins
+            int8_t x0_scales[8], x1_scales[8];
+            int16x8_t x0_mins, x1_mins;
+            {
+                uint32_t scales_mins[3];
+                memcpy(scales_mins, x0->scales, 12);
+                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
+                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
+                const uint32x2_t mins = {mins_0_3, mins_4_7};
+                x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
+                uint32_t scales[2];
+                scales[0] = scales_mins[0] & kmask1; // scales 0~3
+                scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
+                memcpy(x0_scales, scales, 8);
+            }
+            {
+                uint32_t scales_mins[3];
+                memcpy(scales_mins, x1->scales, 12);
+                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
+                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
+                const uint32x2_t mins = {mins_0_3, mins_4_7};
+                x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
+                uint32_t scales[2];
+                scales[0] = scales_mins[0] & kmask1; // scales 0~3
+                scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
+                memcpy(x1_scales, scales, 8);
+            }
+
+            int32x4_t visum = {0};
+
+            // process 64 data points per iteration, totally 256 data points
+            for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) {
+                const int8x16x4_t vy0 = vld1q_s8_x4(qy0);
+                const int8x16x4_t vy1 = vld1q_s8_x4(qy1);
+
+                int8x16_t vx0[4], vx1[4];
+                {
+                    const uint8x16x2_t vv = vld1q_u8_x2(qx0);
+                    vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
+                    vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
+                    vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
+                    vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
+                }
+                {
+                    const uint8x16x2_t vv = vld1q_u8_x2(qx1);
+                    vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
+                    vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
+                    vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
+                    vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
+                }
+
+                // process 32 data points (share same block scale) per iteration
+                for (int k = 0; k < 2; ++k) {
+                    const int blk = j * 2 + k;
+                    const int32x4_t block_scale = {
+                        x0_scales[blk],
+                        x0_scales[blk],
+                        x1_scales[blk],
+                        x1_scales[blk],
+                    };
+
+                    int32x4_t vr = {0};
+                    for (int l = 0; l < 2; ++l) {
+                        const int idx = k * 2 + l;
+                        const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]);
+                        const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]);
+                        const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]);
+                        const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]);
+                        const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64));
+                        const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64));
+                        const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64));
+                        const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64));
+                        vr = vmmlaq_s32(vr, vx_l, vy_l);
+                        vr = vmmlaq_s32(vr, vx_h, vy_h);
+                    }
+                    // apply block scale, will NOT overflow
+                    // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits
+                    visum = vmlaq_s32(visum, vr, block_scale);
+                }
+            }
+
+            // adjust bias, apply superblock scale
+            {
+                int32_t bias[4];
+                // no obvious uplift from sve sdot-16, just use neon mul add
+                const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8));
+                const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8));
+                bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)),
+                                               vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins))));
+                bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)),
+                                               vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins))));
+                bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)),
+                                               vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins))));
+                bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
+                                               vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
+                const float32x4_t dmins = {
+                    GGML_FP16_TO_FP32(x0->dmin) * y0->d,
+                    GGML_FP16_TO_FP32(x0->dmin) * y1->d,
+                    GGML_FP16_TO_FP32(x1->dmin) * y0->d,
+                    GGML_FP16_TO_FP32(x1->dmin) * y1->d,
+                };
+                vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
+
+                const float32x4_t superblock_scale = {
+                    GGML_FP16_TO_FP32(x0->d) * y0->d,
+                    GGML_FP16_TO_FP32(x0->d) * y1->d,
+                    GGML_FP16_TO_FP32(x1->d) * y0->d,
+                    GGML_FP16_TO_FP32(x1->d) * y1->d,
+                };
+                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
+            }
+        }
+
+        // vfsum = ABCD -> ACBD
+        // AC -> s, BD -> (s+bs)
+        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
+        vst1_f32(s,      vget_low_f32 (vfsum));
+        vst1_f32(s + bs, vget_high_f32(vfsum));
+
+        return;
+    }
+#endif
+
 #ifdef __ARM_FEATURE_SVE
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
@@ -270,7 +270,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .from_float               = quantize_row_q4_K,
        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
        .nrows                    = 1,
+#endif
    },
    [GGML_TYPE_Q5_K] = {
        .from_float               = quantize_row_q5_K,
@@ -2414,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    // This is up to the applications.
    DWORD p = THREAD_PRIORITY_NORMAL;
    switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = THREAD_PRIORITY_BELOW_NORMAL;  break;
        case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
        case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
        case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
        case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
    }

+    if (prio != GGML_SCHED_PRIO_LOW) {
+        // Tell Windows that this thread should not be throttled (needs its own CPU core).
+        // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
+        // all our threads onto the first 4 cores which results in terrible performance with
+        // n_threads > 4
+        #if _WIN32_WINNT >= 0x0602
+        THREAD_POWER_THROTTLING_STATE t;
+        ZeroMemory(&t, sizeof(t));
+        t.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+        t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
+        t.StateMask   = 0;
+
+        if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
+            GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
+            return false;
+        }
+        #endif
+    }
+
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        // Keep inherited policy/priority
        return true;
@@ -2447,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    struct sched_param p;
    int32_t policy = SCHED_OTHER;
    switch (prio) {
+        // TODO: there seems to be no way to set lower prio on Apple platforms
+        case GGML_SCHED_PRIO_LOW:      policy = SCHED_OTHER; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -2503,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    struct sched_param p;
    int32_t policy = SCHED_OTHER;
    switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      policy = SCHED_BATCH; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -7633,39 +7633,83 @@ static void ggml_compute_forward_ssm_scan_f32(
    const int ir1 = MIN(ir0 + dr, nr);
    const int ir  = ir1 - ir0;

-    for (int i3 = 0; i3 < n_s; ++i3) {
-        for (int i2 = 0; i2 < n_t; ++i2) {
-            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
-            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
-            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
-            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
-            const float * B  = (const float *) ((const char *) src4->data +  i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
-            const float * C  = (const float *) ((const char *) src5->data +  i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
-                float * y  = (      float *) ((      char *) dst->data  + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
-                float * s  = (      float *) ((      char *) dst->data  + ir0*(src0->nb[1]) + i3*(src0->nb[2]) +     src1->nb[3]);  // {d_state, d_inner, n_s}
+    #ifdef __ARM_FEATURE_SVE
+        for (int i3 = 0; i3 < n_s; ++i3) {
+            for (int i2 = 0; i2 < n_t; ++i2) {
+                const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+                const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+                const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+                const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+                const float * B  = (const float *) ((const char *) src4->data +  i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+                const float * C  = (const float *) ((const char *) src5->data +  i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+                    float * y  = (      float *) ((      char *) dst->data  + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+                    float * s  = (      float *) ((      char *) dst->data  + ir0*(src0->nb[1]) + i3*(src0->nb[2]) +     src1->nb[3]);  // {d_state, d_inner, n_s}

-            // use the output as the source for the next token-wise iterations
-            if (i2 > 0) { s0 = s; }
+                // use the output as the source for the next token-wise iterations
+                if (i2 > 0) { s0 = s; }

-            // d_inner
-            for (int i1 = 0; i1 < ir; ++i1) {
-                // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
-                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
-                float x_dt = x[i1] * dt_soft_plus;
-                float sumf = 0.0f;
-                // d_state
-                for (int i0 = 0; i0 < nc; ++i0) {
-                    int i = i0 + i1*nc;
-                    // state = prev_state * dA + dB * x
-                    float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
-                    // y = rowwise_dotprod(state, C)
-                    sumf += state * C[i0];
-                    s[i] = state;
+                // d_inner
+                for (int i1 = 0; i1 < ir; ++i1) {
+                    float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+                    float x_dt = x[i1] * dt_soft_plus;
+                    svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
+                    svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
+                    svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
+
+                    for (int64_t k = 0; k < nc; k += svcntw()) {
+                        svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]);
+                        svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]);
+                        svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]);
+                        svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]);
+
+                        svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
+                        t1 = exp_ps_sve(svptrue_b32(), t1);
+                        svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
+
+                        vs0 = GGML_F32_VEC_FMA(vs0, t1, t2);
+                        r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
+
+                        GGML_F32_VEC_STORE(&s[i1*nc + k], vs0);
+                    }
+                    y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
                }
-                y[i1] = sumf;
            }
        }
-    }
+    #else
+        for (int i3 = 0; i3 < n_s; ++i3) {
+            for (int i2 = 0; i2 < n_t; ++i2) {
+                const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+                const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+                const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+                const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+                const float * B  = (const float *) ((const char *) src4->data +  i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+                const float * C  = (const float *) ((const char *) src5->data +  i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+                    float * y  = (      float *) ((      char *) dst->data  + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+                    float * s  = (      float *) ((      char *) dst->data  + ir0*(src0->nb[1]) + i3*(src0->nb[2]) +     src1->nb[3]);  // {d_state, d_inner, n_s}
+
+                // use the output as the source for the next token-wise iterations
+                if (i2 > 0) { s0 = s; }
+
+                // d_inner
+                for (int i1 = 0; i1 < ir; ++i1) {
+                    // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
+                    float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+                    float x_dt = x[i1] * dt_soft_plus;
+                    float sumf = 0.0f;
+                    // d_state
+                    for (int i0 = 0; i0 < nc; ++i0) {
+                        int i = i0 + i1*nc;
+                        // state = prev_state * dA + dB * x
+                        float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+                        // y = rowwise_dotprod(state, C)
+                        sumf += state * C[i0];
+                        s[i] = state;
+                    }
+                    y[i1] = sumf;
+                }
+            }
+        }
+    #endif
 }

 void ggml_compute_forward_ssm_scan(
@@ -647,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
 #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif

+/* SVE vector approximation of exp(src) under predicate pg. Borrowed from the GitHub repository:
+https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
+        // Constants (each broadcast to every lane)
+        const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
+        const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
+        const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
+        const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
+        const svfloat32_t one = svdup_n_f32(1.0f);
+        const svfloat32_t inactive1 = svdup_n_f32(0.0f);
+        const svint32_t inactive2 = svdup_n_s32(0);
+
+        // Algorithm: split y = x*log2(e) into integer n and fraction a, then combine fexpa, 2^n and a polynomial in z
+        svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
+        svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);         // floor(y), still as float
+        svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);         // n = floor(y) as int32
+
+        t1 = svsub_f32_m(pg, t0, t1);   // a = y - floor(y)
+        t1 = svadd_f32_m(pg, t1, one);  // b = a + 1
+
+        svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
+        svfloat32_t t4 = svexpa_f32(t3);                                   // c = fexpa(v)
+        t4 = svscale_f32_m(pg, t4, t2);                                    // fexpa(v) * 2^(n)
+
+        // b with its low 17 bits cleared (bitwise AND with not_mask17)
+        svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
+        t5 = svsub_f32_m(pg, t1, t5);                // z = b - (b with low 17 bits cleared)
+        t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);  // ln2 + half_ln2_sq * z
+        t0 = svmla_f32_m(pg, one, t5, t0);           // 1 + (ln2 * z) + (half_ln2_sq * z * z)
+        t0 = svmul_f32_m(pg, t0, t4);                // final result: polynomial(z) * fexpa(v) * 2^n
+
+        return t0;
+    }
+#endif
+
 #if defined(__ARM_NEON) && defined(__aarch64__)

 // adapted from arm limited optimized routine
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
        int     nsm;                // number of streaming multiprocessors
        size_t  smpb;               // max. shared memory per block
        size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    integrated;         // Device is integrated as opposed to discrete
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
@@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
-#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING

    static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");

@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;
-
-        info.devices[id].nsm       = prop.multiProcessorCount;
-        info.devices[id].smpb      = prop.sharedMemPerBlock;
-        info.devices[id].warp_size = prop.warpSize;
+        info.devices[id].integrated = prop.integrated;
+        info.devices[id].nsm        = prop.multiProcessorCount;
+        info.devices[id].smpb       = prop.sharedMemPerBlock;
+        info.devices[id].warp_size  = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
        info.devices[id].smpbo = prop.sharedMemPerBlock;

@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
    GGML_UNUSED(buft);
 }

+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+}
+
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    // flag used to determine whether it is an integrated_gpu
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

    while (!graph_evaluated_or_captured) {
        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                    if (node->src[j] != nullptr) {
                        assert(node->src[j]->buffer);
                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
-                               ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                               ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
                    }
                }
 #endif
@@ -2994,9 +3000,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            {
                struct ggml_tensor * a = op->src[0];
                struct ggml_tensor * b = op->src[1];
-                // for small weight matrices the active device can end up without any rows, don't use row split in those cases
-                // this avoids some edge cases (and the performance would not be good anyways)
                if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+                    if (a->ne[2] > 1 || a->ne[3] > 1) {
+                        return false;
+                    }
+                    // for small weight matrices the active device can end up without any rows, don't use row split in those cases
+                    // this avoids some edge cases (and the performance would not be good anyways)
                    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
                    int64_t row_low;
                    int64_t row_high;
@@ -3263,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 }

 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
+    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
 }

 static int64_t get_op_batch_size(const ggml_tensor * op) {
@@ -32,6 +32,8 @@
 extern "C" {
 #endif

+void ggml_print_backtrace(void);
+
 #ifndef MIN
 #    define MIN(a, b) ((a) < (b) ? (a) : (b))
 #endif
@@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL)
        If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
        source /opt/intel/oneapi/setvars.sh")
 else()
-    message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
+    message(FATAL_ERROR "C++ compiler lacks SYCL support.")
 endif()
 message(STATUS "SYCL found")
 #todo: AOT
@@ -170,7 +170,7 @@ else()
        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
        if (NOT GGML_SYCL_DEVICE_ARCH)
-            message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
+            message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
        endif()
        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
        target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
@@ -1434,6 +1434,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
 }

+template <int ElementsPerWI>
+static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
+                                                      const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
+    /*
+        Quantizes and reorders the resultant q8 tensor in a per row fashion
+        Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
+    */
+
+    auto subgroup_id = it.get_group(0);
+    auto wi_id       = it.get_local_id(0);
+
+    const int num_blocks_per_row = kx / QK8_1;
+    auto      row                = subgroup_id / num_blocks_per_row;
+    auto      col                = subgroup_id % num_blocks_per_row;
+
+    auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
+    auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
+
+    auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
+    auto ds_ptr    = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
+
+    sycl::vec<float, ElementsPerWI>  wi_f32_vals;
+    sycl::vec<int8_t, ElementsPerWI> quantized_values;
+
+    auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
+    wi_f32_vals           = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
+
+    float sum  = 0.0f;
+    float amax = 0.0f;
+
+#pragma unroll(ElementsPerWI)
+    for (int i = 0; i < ElementsPerWI; i++) {
+        sum += wi_f32_vals[i];
+        amax                = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
+        quantized_values[i] = 0;
+    }
+    sum     = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
+    amax    = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
+    float d = amax == 0 ? 1 : amax / 127;
+
+#pragma unroll(ElementsPerWI)
+    for (int i = 0; i < ElementsPerWI; i++) {
+        quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
+    }
+
+    d = amax == 0 ? 0 : d;
+
+    *reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
+    if (wi_id == 0) {
+        *ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
+    }
+}
+
 static void mul_mat_p021_f16_f32(
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1718,23 +1771,30 @@ static  void pool2d_nchw_kernel(
        o_ptr[cur_oh * ow + cur_ow] = res;
 }

-static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
-                                   const int ky, const int kx_padded,
-                                   queue_ptr stream) {
-    const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
-    const sycl::range<3> num_blocks(1, ky, block_num_x);
-    int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
-    static_assert(QK8_1 % WARP_SIZE == 0);
-    const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
+static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
+                                   bool reorder_q8_tensor, queue_ptr stream) {
+    if (reorder_q8_tensor) {
+        auto local_range      = std::size_t(WARP_SIZE);
+        auto num_quant_blocks = ky * (kx / QK8_1);
+        auto global_range     = num_quant_blocks * local_range;
+        stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
+                             [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
+                             });
+    } else {
+        const int            block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
+        const sycl::range<3> num_blocks(1, ky, block_num_x);
+        int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
+        static_assert(QK8_1 % WARP_SIZE == 0);
+        const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
+        {
+            dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-        stream->parallel_for(
-            sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
-            });
+            stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
+                                 [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                     quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
+                                 });
+        }
    }
 }

@@ -2446,9 +2506,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
            dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

            if (src1_on_device && src1_is_contiguous) {
+                bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
                scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
                                                     /*num_src=*/2, " : converting src1 to Q8_1");
-                quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
                /*
                DPCT1010:90: SYCL uses exceptions to report errors and does not
                use the error codes. The call was replaced with 0. You need to
@@ -2554,7 +2615,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
                    scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
                                                         /*num_src=*/2, " : converting src1 to Q8_1");
-                    quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
                    /*
                    DPCT1010:92: SYCL uses exceptions to report errors and does
                    not use the error codes. The call was replaced with 0. You
@@ -4257,14 +4318,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_SOFT_MAX:
            return true;
        case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-                // mode is not used as a bitmask in practice, the various rope type modes are independent implementations
-                if (mode == GGML_ROPE_TYPE_MROPE) {
-                    return false;
-                }
-                return true;
-            }
        case GGML_OP_IM2COL:
            return true;
        case GGML_OP_UPSCALE:
@@ -29,8 +29,6 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
    static_assert(blocks_per_subgroup > 0);
    static_assert(block_elements_per_subgroup > 0);

-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
    float partial_sum = 0.0f;
    for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
        const int ibx       = row * blocks_per_row + i;  // x block index
@@ -40,13 +38,15 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r

        // Y block index that aligns with ibx
        const int iby = i * block_type::block_to_q8_1_ratio();
+        const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
+        const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2));

 #pragma unroll
        for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
            // x block quant index when casting the quants to int
            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);

-            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs, nblocks);
        }
    }

@@ -49,10 +49,7 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const

    if (i0 >= n_dims) {
        const int i = row * ne0 + i0;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
+        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
        return;
    }

@@ -93,10 +90,7 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const

    if (i0 >= n_dims) {
        const int i = row * ne0 + i0;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
+        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
        return;
    }

@@ -122,6 +116,63 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const
    dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta;
 }

+template <typename T, bool has_ff>
+static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
+                        const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
+                        const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
+                        const float theta_scale, const float * freq_factors, const mrope_sections sections,
+                        const sycl::nd_item<3> & item_ct1) {
+    // get index pos
+    const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1));
+    if (i0 >= ne0) {
+        return;
+    }
+    const int    row_dst   = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2);
+
+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
+        return;
+    }
+
+    const int    row_x     = row_dst % ne1;
+    const int    channel_x = row_dst / ne1;
+    const int    idst      = (row_dst * ne0) + (i0 / 2);
+    const size_t ix        = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2);
+
+    const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    const int sec_w = sections.v[1] + sections.v[0];
+    const int sector = (i0 / 2) % sect_dims;
+
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+        theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) {
+        theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
+    float       cos_theta;
+    float       sin_theta;
+    rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    const float x0 = x[ix + 0];
+    const float x1 = x[ix + n_dims/2];
+
+    // store results in dst
+    dst[idst + 0]      = x0 * cos_theta - x1 * sin_theta;
+    dst[idst + n_dims/2] = x0 * sin_theta + x1 * cos_theta;
+}
+
+
+
 template <typename T, bool has_ff>
 static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
                        const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
@@ -171,7 +222,7 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
                           const float * freq_factors, queue_ptr stream) {
    GGML_ASSERT(ne0 % 2 == 0);
    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int            num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
+    const int            num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
    const sycl::range<3> block_nums(1, num_blocks_x, nr);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -208,7 +259,7 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
                           const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
    GGML_ASSERT(ne0 % 2 == 0);
    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int            num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
+    const int            num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
    const sycl::range<3> block_nums(1, num_blocks_x, nr);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -228,6 +279,40 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
    }
 }

+template <typename T>
+static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
+                             const size_t s2, const int n_dims, const int nr, const int32_t * pos,
+                             const float freq_scale, const float freq_base, const float ext_factor,
+                             const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors,
+                             const mrope_sections sections, queue_ptr stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const sycl::range<3>    block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int               n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
+    const sycl::range<3>    grid_dims(1, n_blocks_y, nr);
+    const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);
+
+    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
+    // Require FP16 device support when T is sycl::half
+    if constexpr (std::is_same_v<T, sycl::half>) {
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+    }
+    // launch kernel
+    if (freq_factors == nullptr) {
+        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
+            rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
+                                  corr_dims, theta_scale, freq_factors, sections, item_ct1);
+        });
+    } else {
+        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
+            rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
+                                 corr_dims, theta_scale, freq_factors, sections, item_ct1);
+        });
+    }
+}
+
+
+
+
 // rope vision
 template <typename T>
 static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
@@ -237,7 +322,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
                             const mrope_sections sections, queue_ptr stream) {
    GGML_ASSERT(ne0 % 2 == 0);
    const sycl::range<3>    block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int               n_blocks_y = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
+    const int               n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
    const sycl::range<3>    grid_dims(1, n_blocks_y, nr);
    const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);

@@ -298,8 +383,17 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
    memcpy(&sections.v,  (int32_t *) dst->op_params + 11, sizeof(int)*4);

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }
+
    const int32_t * pos = (const int32_t *) dst->src[1]->data;

    const float * freq_factors = nullptr;
@@ -326,6 +420,19 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
        } else {
            GGML_ABORT("fatal error");
        }
+    } else if (is_mrope && !is_vision) {
+        GGML_SYCL_DEBUG("%s: mrope path\n", __func__);
+        if (dst->src[0]->type == GGML_TYPE_F16) {
+            rope_multi_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01,
+                s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                freq_factors, sections, main_stream);
+        } else if (dst->src[0]->type == GGML_TYPE_F32) {
+            rope_multi_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims,
+                             nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections,
+                             main_stream);
+        } else {
+            GGML_ABORT("Fatal error: Tensor type unsupported!");
+        }
    } else if (is_vision) {
        GGML_SYCL_DEBUG("%s: vision path\n", __func__);
        if (dst->src[0]->type == GGML_TYPE_F16) {
@@ -285,21 +285,21 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
    }

    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
+                     const int8_t* q8_1_quant_ptr, const sycl::half2* q8_1_ds, const int & iqs, int /* nblocks */) {
        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
        const ggml_half d     = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
        int             v[q4_0_traits::vdr_mmvq];
        int             u[2 * q4_0_traits::vdr_mmvq];

-#pragma unroll

+#pragma unroll
        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
            v[i]         = get_int_from_uint8(bq4_0, iqs + i);
-            u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-            u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
+            u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
+            u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
        }

-        return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds);
+        return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
    };
 };

@@ -347,7 +347,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
    using q4_k_traits = typename q4_k_block::traits;

    float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
+                     const int8_t* q8_1_quant_ptr, const sycl::half2* q8_1_ds, const int & iqs, int nblocks) {
        const int ib = ibx_offset / (QK_K / 2);

        const uint8_t *    base           = static_cast<const uint8_t *>(vbq);
@@ -360,7 +360,38 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
        const int *      q4         = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
        const uint16_t * scales     = (const uint16_t *) scs;

-        return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
+        int   v[2];
+        int   u[2 * QR4_K];
+        float d8[QR4_K];
+
+        v[0] = q4[0];
+        v[1] = q4[4];
+
+        uint16_t  aux[2];
+        const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
+        if (j < 2) {
+            aux[0] = scales[j + 0] & 0x3f3f;
+            aux[1] = scales[j + 2] & 0x3f3f;
+        } else {
+            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
+            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
+        }
+
+        const uint8_t * sc = (const uint8_t *) aux;
+        const uint8_t * m  = sc + 2;
+
+        for (int i = 0; i < QR4_K; ++i) {
+            const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
+            sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
+
+            d8[i]                   = ds_values[0];
+
+            const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
+            u[2 * i + 0]   = q8[0];
+            u[2 * i + 1]   = q8[4];
+        }
+
+        return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
    }
 };

@@ -1652,7 +1652,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
        return {64, 32};
    }
    return {64, 64};
-};
+}

 static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {

@@ -133,7 +133,7 @@ static void ggml_print_backtrace_symbols(void) {
 }
 #endif

-static void ggml_print_backtrace(void) {
+void ggml_print_backtrace(void) {
    const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
    if (GGML_NO_BACKTRACE) {
        return;
@@ -160,6 +160,10 @@ static void ggml_print_backtrace(void) {
    const int parent_pid = getpid();
    const int child_pid = fork();
    if (child_pid < 0) { // error
+#if defined(__linux__)
+        close(lock[1]);
+        close(lock[0]);
+#endif
        return;
    } else if (child_pid == 0) { // child
        char attach[32];
@@ -167,6 +171,7 @@ static void ggml_print_backtrace(void) {
 #if defined(__linux__)
        close(lock[1]);
        (void) !read(lock[0], lock, 1);
+        close(lock[0]);
 #endif
        // try gdb
        execlp("gdb", "gdb", "--batch",
@@ -195,7 +200,7 @@ static void ggml_print_backtrace(void) {
    }
 }
 #else
-static void ggml_print_backtrace(void) {
+void ggml_print_backtrace(void) {
    // platform not supported
 }
 #endif
@@ -216,6 +221,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
    abort();
 }

+// ggml_print_backtrace is called from the std::terminate handler that ggml.cpp installs with std::set_terminate
+
 //
 // logging
 //
@@ -0,0 +1,26 @@
+#include "ggml-impl.h"
+
+#include <cstdlib>
+#include <exception>
+
+static std::terminate_handler previous_terminate_handler;
+
+// Terminate handler: prints a backtrace, then chains to the handler that was
+// installed before this one (if any). Never returns.
+GGML_NORETURN static void ggml_uncaught_exception() {
+    ggml_print_backtrace();
+
+    const std::terminate_handler chained = previous_terminate_handler;
+    if (chained != nullptr) {
+        chained();
+    }
+
+    // reached only when there was no previous handler, or if it returned
+    abort();
+}
+
+// Runs once during static initialization: unless GGML_NO_BACKTRACE is set in the
+// environment, remembers the current terminate handler and installs
+// ggml_uncaught_exception in its place. The stored value is true when the
+// handler was installed and false when installation was skipped.
+static bool ggml_uncaught_exception_init = []{
+    if (getenv("GGML_NO_BACKTRACE") != nullptr) {
+        return false;
+    }
+
+    const std::terminate_handler current = std::get_terminate();
+    // installing twice would make the handler chain to itself
+    GGML_ASSERT(current != ggml_uncaught_exception);
+
+    previous_terminate_handler = current;
+    std::set_terminate(ggml_uncaught_exception);
+    return true;
+}();
@@ -347,11 +347,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    int64_t n_tensors = 0;

    if (ok && gr.read(ctx->version)) {
-        if (ctx->version == 1) {
+        if (ok && ctx->version == 0) {
+            GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        /*
+         * the byte order is reversed when reading non-native endian models.
+         * assuming that the GGUF version is 3, a host of the opposite
+         * endianness would read it as 0x03000000. real version numbers are
+         * small, so we can use the AND operation against the last 4
+         * hexadecimal digits (the low 16 bits): if they are all zero, the
+         * model most likely does not have the same endianness as the host system.
+        */
+        if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
+            GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        if (ok && ctx->version == 1) {
            GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
            ok = false;
        }
-        if (ctx->version > GGUF_VERSION) {
+        if (ok && ctx->version > GGUF_VERSION) {
            GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                __func__, ctx->version, GGUF_VERSION);
            ok = false;
@@ -1036,6 +1036,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
@@ -49,6 +49,7 @@ class TensorInfo:
 class GGUFValue:
    value: Any
    type: GGUFValueType
+    sub_type: GGUFValueType | None = None


 class WriterState(Enum):
@@ -238,7 +239,7 @@ class GGUFWriter:

            for key, val in kv_data.items():
                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
-                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
+                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True, sub_type=val.sub_type)

            fout.write(kv_bytes)

@@ -268,11 +269,11 @@ class GGUFWriter:
            fout.flush()
        self.state = WriterState.TI_DATA

-    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
+    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
        if any(key in kv_data for kv_data in self.kv_data):
            raise ValueError(f'Duplicated key name {key!r}')

-        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
+        self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)

    def add_uint8(self, key: str, val: int) -> None:
        self.add_key_value(key,val, GGUFValueType.UINT8)
@@ -1022,7 +1023,7 @@ class GGUFWriter:
            pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
        return struct.pack(f'{pack_prefix}{fmt}', value)

-    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
+    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool, sub_type: GGUFValueType | None = None) -> bytes:
        kv_data = bytearray()

        if add_vtype:
@@ -1043,7 +1044,9 @@ class GGUFWriter:
            if len(val) == 0:
                raise ValueError("Invalid GGUF metadata array. Empty array")

-            if isinstance(val, bytes):
+            if sub_type is not None:
+                ltype = sub_type
+            elif isinstance(val, bytes):
                ltype = GGUFValueType.UINT8
            else:
                ltype = GGUFValueType.get_type(val[0])
@@ -1521,19 +1521,21 @@ class GGUFEditorWindow(QMainWindow):
                    continue

                # Apply changes if any
+                sub_type = None
                if field.name in self.metadata_changes:
                    value_type, value = self.metadata_changes[field.name]
                    if value_type == GGUFValueType.ARRAY:
                        # Handle array values
-                        element_type, array_values = value
-                        writer.add_array(field.name, array_values)
-                    else:
-                        writer.add_key_value(field.name, value, value_type)
+                        sub_type, value = value
                else:
                    # Copy original value
                    value = field.contents()
-                    if value is not None and field.types:
-                        writer.add_key_value(field.name, value, field.types[0])
+                    value_type = field.types[0]
+                    if value_type == GGUFValueType.ARRAY:
+                        sub_type = field.types[-1]
+
+                if value is not None:
+                    writer.add_key_value(field.name, value, value_type, sub_type=sub_type)

            # Add new metadata
            for key, (value_type, value) in self.metadata_changes.items():
@@ -1541,7 +1543,12 @@ class GGUFEditorWindow(QMainWindow):
                if self.reader.get_field(key) is not None:
                    continue

-                writer.add_key_value(key, value, value_type)
+                sub_type = None
+                if value_type == GGUFValueType.ARRAY:
+                    # Handle array values
+                    sub_type, value = value
+
+                writer.add_key_value(key, value, value_type, sub_type=sub_type)

            # Add tensors (including data)
            for tensor in self.reader.tensors:
@@ -24,6 +24,7 @@ class MetadataDetails(NamedTuple):
    type: gguf.GGUFValueType
    value: Any
    description: str = ''
+    sub_type: gguf.GGUFValueType | None = None


 def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
@@ -57,7 +58,9 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
            logger.debug(f'Removing {field.name}')
            continue

-        old_val = MetadataDetails(field.types[0], field.contents())
+        val_type = field.types[0]
+        sub_type = field.types[-1] if val_type == gguf.GGUFValueType.ARRAY else None
+        old_val = MetadataDetails(val_type, field.contents(), sub_type=sub_type)
        val = new_metadata.get(field.name, old_val)

        if field.name in new_metadata:
@@ -67,7 +70,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
            logger.debug(f'Copying {field.name}')

        if val.value is not None:
-            writer.add_key_value(field.name, val.value, val.type)
+            writer.add_key_value(field.name, val.value, val.type, sub_type=sub_type if val.sub_type is None else val.sub_type)

    if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
        logger.debug('Adding chat template(s)')
@@ -157,6 +157,7 @@ class TensorNameMap:
            "h.{bid}.attn.c_attn",                                                 # gpt2
            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
            "encoder.layers.{bid}.attn.Wqkv",                                      # nomic-bert
+            "encoder.layers.{bid}.mixer.Wqkv",                                     # jina
            "model.layers.{bid}.self_attn.qkv_proj",                               # phi3
            "encoder.layers.{bid}.self_attention.query_key_value",                 # chatglm
            "transformer.layers.{bid}.attn.qkv_proj",                              # openelm
@@ -168,6 +169,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.q_proj_no_perm",               # llama-custom
            "layers.{bid}.attention.wq",                                 # llama-pth
            "encoder.layer.{bid}.attention.self.query",                  # bert
+            "transformer.layer.{bid}.attention.q_lin",                   # distillbert
            "transformer.h.{bid}.attn.q_proj",                           # gpt-j
            "model.layers.layers.{bid}.self_attn.q_proj",                # plamo
            "model.layers.{bid}.attention.wq",                           # internlm2
@@ -182,6 +184,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.k_proj_no_perm",             # llama-custom
            "layers.{bid}.attention.wk",                               # llama-pth
            "encoder.layer.{bid}.attention.self.key",                  # bert
+            "transformer.layer.{bid}.attention.k_lin",                 # distillbert
            "transformer.h.{bid}.attn.k_proj",                         # gpt-j
            "transformer.h.{bid}.attn.k",                              # refact
            "model.layers.layers.{bid}.self_attn.k_proj",              # plamo
@@ -196,6 +199,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
            "layers.{bid}.attention.wv",                                 # llama-pth
            "encoder.layer.{bid}.attention.self.value",                  # bert
+            "transformer.layer.{bid}.attention.v_lin",                   # distillbert
            "transformer.h.{bid}.attn.v_proj",                           # gpt-j
            "transformer.h.{bid}.attn.v",                                # refact
            "model.layers.layers.{bid}.self_attn.v_proj",                # plamo
@@ -216,6 +220,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.linear_attn",                     # deci
            "layers.{bid}.attention.wo",                                    # llama-pth
            "encoder.layer.{bid}.attention.output.dense",                   # bert
+            "transformer.layer.{bid}.attention.out_lin",                    # distillbert
            "transformer.h.{bid}.attn.out_proj",                            # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense",     # persimmon
            "model.layers.{bid}.self_attn.dense",                           # persimmon
@@ -224,6 +229,7 @@ class TensorNameMap:
            "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
            "model.layers.{bid}.attention.wo",                              # internlm2
            "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
+            "encoder.layers.{bid}.mixer.out_proj",                          # jina
            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
            "encoder.layers.{bid}.self_attention.dense",                    # chatglm
@@ -235,6 +241,7 @@ class TensorNameMap:
        # Attention output norm
        MODEL_TENSOR.ATTN_OUT_NORM: (
            "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
+            "transformer.layer.{bid}.sa_layer_norm",           # distillbert
            "encoder.layers.{bid}.norm1",                      # nomic-bert
            "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
@@ -311,6 +318,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
            "layers.{bid}.feed_forward.w3",                           # llama-pth
            "encoder.layer.{bid}.intermediate.dense",                 # bert
+            "transformer.layer.{bid}.ffn.lin1",                       # distillbert
            "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
            "transformer.h.{bid}.mlp.linear_3",                       # refact
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
@@ -394,6 +402,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
            "layers.{bid}.feed_forward.w2",                           # llama-pth
            "encoder.layer.{bid}.output.dense",                       # bert
+            "transformer.layer.{bid}.ffn.lin2",                       # distillbert
            "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
            "model.layers.{bid}.mlp.dense_4h_to_h",                   # persimmon
@@ -455,6 +464,7 @@ class TensorNameMap:

        MODEL_TENSOR.LAYER_OUT_NORM: (
            "encoder.layer.{bid}.output.LayerNorm",         # bert
+            "transformer.layer.{bid}.output_layer_norm",    # distillbert
            "encoder.layers.{bid}.norm2",                   # nomic-bert
            "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
            "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
@@ -825,6 +835,7 @@ class TensorNameMap:
        MODEL_TENSOR.CLS: (
            "classifier",       # jina
            "classifier.dense", # roberta
+            "pre_classifier",   # distillbert
        ),

        MODEL_TENSOR.CLS_OUT: (
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.16.3"
+version = "0.17.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -259,9 +259,9 @@ extern "C" {
        llama_token  *  token;
        float        *  embd;
        llama_pos    *  pos;
-        int32_t      *  n_seq_id;
-        llama_seq_id ** seq_id;
-        int8_t       *  logits; // TODO: rename this to "output"
+        int32_t      *  n_seq_id; // TODO: remove, should belong to only 1 sequence
+        llama_seq_id ** seq_id;   // TODO: become llama_seq_id * seq_id;
+        int8_t       *  logits;   // TODO: rename this to "output"
    } llama_batch;

    enum llama_model_kv_override_type {
@@ -366,6 +366,8 @@ extern "C" {
        bool no_perf;     // measure performance timings
        bool op_offload;  // offload host tensor operations to device
        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
    };

    // model quantization parameters
@@ -502,6 +504,7 @@ extern "C" {
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -652,7 +655,6 @@ extern "C" {
    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_self_seq_add(
@@ -665,7 +667,6 @@ extern "C" {
    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_self_seq_div(
@@ -677,12 +678,14 @@ extern "C" {

    // Returns the smallest position present in the KV cache for the specified sequence
    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
    // Return -1 if the sequence is empty
    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);

    // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
    // Return -1 if the sequence is empty
    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
@@ -691,14 +694,15 @@ extern "C" {
    // Defragment the KV cache
    // This will be applied:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");

    // Check if the context supports KV cache shifting
    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);

    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");

    //
    // State / sessions
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 29464 2094 1018 1092 2706
- 11865 17875
+ 9706 7959 2140



@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 17245 16604 16403 16604 33583 18355
- 16421 51153
-
- 16604
- 16650
- 16650 16604
- 16581
- 16582
- 16582 16582
- 16582 16582 16582
- 16581 16582
- 31596 17394
- 34926 17394
- 31596 18671
- 34926 18671
- 34926 18671 16384
- 31596 16395 17394 16384
- 34926 16395 17394 16384
- 16811 16704 20410 16483 16631 16397 52854
- 16470 16399 16403 16407 16604 16406 35764 38185 51595 22592 26639
- 29479 23955 17012 20103 25527 27670 17408 19005 21473 24774
- 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 21954 16607 21954 16633 21954 16611 29409 16607 21954 16615
- 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 16604 16391 24664 17153 57169 16721 16872 17073 17304 28729 16392
- 31596
- 34926
- 16650 31596
- 16650 34926
- 16696 31596
- 16696 31596 16582 16696 31596
- 16604 16391
- 16582 16604 16412
- 16390 22623
- 31596 16395 16712 16390 16828 16384 17674 16769 16732 23686 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636
- 16384 16384 16384 16384 16384 16384
- 16402
- 16402 16402
- 16402 16402 16402
- 16402 16402 16402 16402
- 16402 16402 16402 16402 16402
- 16402 16402 16402 16402 16402 16402
- 16402 16402 16402 16402 16402 16402 16402
- 16402 16402 16402 16402 16402 16402 16402 16402
- 16402 16402 16402 16402 16402 16402 16402 16402 16402
- 16418 19038 16639 16448 24315 33727 16467
- 18765 17981
- 16582 16604 16582 16582 16604 16582 16582 16582 16604 16581 16604 16581 16581 16604 16581 16582 16650 16582 16650 16604 16582 16696 16582 16696 16604 16582 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 20410 16483 16631 18885 16483 16631 16604 16402 16604 16402 16402 16604 16402 16402 16402 16604 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16402 16604 16402 16397 16402 16604 16402 16397 16397 16402 16604 16402 16397 16397 16397 16402 16604 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 27683 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 16604 16396 16396 16396 16396 16396 16396 16412 16412 16412 16412 16412 16412 16412 27268 23955 17012 20103 25527 27670 17408 19005 21473 24774 16604 16390 16390 16390 16390 16390 16390 16447 16447 16447 16447 16447 16447 16447 16385 16385 16385 16385 16397 16397 16397 16397 16397 16397 16384 16384 16384 16384 16384 16384 16414 16414 16414 16414 16414 16414 16687 16390 16690 16992 16604 16390 61797 16733 16390 16466 16986 16395 16604 16390 17879 16732 17811 16414 16604 16390 16428 16804 17811 16687 16390 16683 17190 16728 16395 16604 16390 16419 16732 16945 16991 25251 16414 17119 16390 38127 16641 16390 16459 16427
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 2536 228 27 228 22957 6983
- 45 193433
+ 90711 87 20910

 228
 1667
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 1050 207 19 207 19192 4217
- 37 32009 71 6247
+ 125 213 26862 282

 207
 243
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 1052 207 19 207 19109 4223
- 37 100014 71 6245
+ 82077 26723 282

 207
 243
@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 1122 220 19 220 26062 3951
- 37 50753 261
-
- 220
- 256
- 262
- 197
- 198
- 271
- 1406
- 1572
- 9707 1879
- 21927 1879
- 9707 4337
- 21927 4337
- 21927 4337 0
- 9707 11 1879 0
- 21927 11 1879 0
- 419 374 11162 99 247 13 10821
- 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
- 78762 14144 1456 13073 63471 33594 3038 133178 79012
- 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
- 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
- 9707
- 21927
- 220 21927
- 256 21927
- 262 21927
- 262 21927 198 262 21927
- 320
- 198 284
- 6 11385
- 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
- 17085 2928
- 18
- 18 18
- 18 18 18
- 18 18 18 18
- 18 18 18 18 18
- 18 18 18 18 18 18
- 18 18 18 18 18 18 18
- 18 18 18 18 18 18 18 18
- 18 18 18 18 18 18 18 18 18
- 34 90063 128324
- 2560 2347
- 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 878 204 31 3068 133 2137
- 28611 132 30042
+ 34502 18614 286

 204
 258
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 798 604 25208 1933
- 37 9116 71 11751
+ 127 226 79 69 417

 220
 220 220
@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 1165 220 19 220 27124 5503
- 37 19194 259
-
- 220
- 256
- 271
- 197
- 198
- 279
- 2499
- 2775
- 13225 2375
- 32949 2375
- 13225 5922
- 32949 5922
- 32949 5922 0
- 13225 11 2375 0
- 32949 11 2375 0
- 495 382 9552 99 247 13 17159
- 86 45404 220 22 10191 2852 22924 4750 6916
- 3907 53641 1235 185386 8118
- 11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335
- 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8
- 13225
- 32949
- 220 32949
- 256 32949
- 271 32949
- 271 32949 198 271 32949
- 350
- 198 314
- 6 6837
- 13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208
- 147475
- 18
- 2546
- 15517
- 15517 18
- 15517 2546
- 15517 15517
- 15517 15517 18
- 15517 15517 2546
- 15517 15517 15517
- 34 60213 53904
- 2960 3098
- 126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 1142 220 19 220 27154 4038
- 37 51853 261
+ 88075 16276 301

 220
 256
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 474 287 29871 29946 29871 30226 7378
- 383 4000 261
+ 11585 7810 295

 259
 1678
@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 1190 220 32 220 18215 7112
- 50 16800 258
-
- 220
- 256
- 277
- 197
- 198
- 368
- 2946
- 3271
- 19873 3817
- 39715 3817
- 19873 7353
- 39715 7353
- 39715 7353 13
- 19873 24 3817 13
- 39715 24 3817 13
- 544 373 9522 112 247 26 36315
- 99 39923 220 35 9607 21498 21470 3679 9433
- 1595 7653 633 79829 34051 1636
- 8755 102595 115960 21125 148305 96819 102816 39048 14105 22528 160234
- 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 330 7384 88230 511 947 1492 3742 7233 21
- 19873
- 39715
- 220 39715
- 256 39715
- 277 39715
- 277 39715 198 277 39715
- 330
- 198 319
- 19 7359
- 19873 24 386 87799 13 2403 583 650 51358 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645
- 17931 4959
- 31
- 1922
- 12325
- 12325 31
- 12325 1922
- 12325 12325
- 12325 12325 31
- 12325 12325 1922
- 12325 12325 12325
- 47 19811 12077
- 3260 3579
- 198 7283 51499 191231 20192 3271 3322 9287 2143 17860 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 9522 112 247 172394 247 220 31 220 1922 220 12325 220 12325 31 220 12325 1922 220 12325 12325 220 12325 12325 31 220 12325 12325 1922 220 31 26 31 220 31 396 31 220 31 1043 31 117131 102595 115960 21125 148305 96819 102816 80883 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645 79745 150278 117079 633 79829 34051 1636 25611 41990 109428 1488 91054 24072 17931 4959 29795 9296 16517 1806 481 96 1386 36633 1609 24 481 1109 650 5074 43 481 57 702 5074 27088 2170 536 24 481 48 650 1933 1696 30262 43 1665 19 32818 262 27236 56
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 728 577 24142 2607
- 39 26288 6554
+ 37515 18569 293

 209
 50276
@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 17 297 201 78660 21775
- 72805 4097 56
-
-
-
-
-
-
-
-
-
- 35378 8999
- 35378 8999
- 35378 6661
- 35378 6661
- 35378 6661 38
- 35378 4 8999 38
- 35378 4 8999 38
- 903 83 6 3 5 238 6366
- 148 7709 1019 361 458 134362 104 7 71 420 1132
- 14271 29 117152
- 6 149561 78270 48967 64254 7616 81705
- 6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 15 191 538 28 121505 450 1556 6863 10002 47 1098 16
- 35378
- 35378
- 35378
- 35378
- 35378
- 35378 35378
- 15
- 2203
- 242 1615
- 35378 4 113 25 5584 38 11249 621 398 6 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087
- 6 90827
- 138
- 3912
- 6 66000
- 138 66000
- 3912 66000
- 6 66000 66000
- 138 66000 66000
- 3912 66000 66000
- 6 66000 66000 66000
- 199152 3763
- 17116 99397
- 6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 6 3 138 3912 6 66000 138 66000 3912 66000 6 66000 66000 138 66000 66000 3912 66000 66000 80308 1031 5 363 138 27 363 6 149561 78270 48967 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087 6 110405 1369 69112 69112 69112 14271 29 117152 5106 4765 4765 1135 164721 164721 164721 58 58 58 58 2551 90827 32 85908 87 25 272 2809 242 18 18345 764 25 7 2685 4 242 11766 398 9077 32 242 594 959 9077 87 25 1181 3249 442 4 242 397 398 1884 3060 26156 32 1401 25 26455 10 25 141 866
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 474 287 29871 29946 29871 30226 7378
- 383 4000 261
+ 11585 7810 295

 259
 1678
@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 2014 1032 1052 1032 28504 6972
- 1070 7088 1258
-
- 1032
- 1256
- 1293
- 1009
- 1010
- 1267
- 4688
- 1009 1010
- 22177 4304
- 45383 4304
- 22177 5325
- 45383 5325
- 45383 5325 1033
- 22177 1044 4304 1033
- 45383 1044 4304 1033
- 1593 1395 119685 1166 1153 1046 51228
- 1119 1048 1052 1056 1032 1055 17391 23216 30203 7785 17279
- 3337 30757 1902 4200 63073 3671
- 1225 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1225 1158 1129 1225 1158 1155 1225 1158 1133 1225 21359 1225 1158 1137
- 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 1319 11234 1873 26303 1455 1934 2246 3754 10835 1041
- 22177
- 45383
- 1032 45383
- 1256 45383
- 1293 45383
- 1293 45383 1010 1293 45383
- 1319
- 1010 1376
- 1039 4033
- 22177 1044 1404 48054 1033 3075 1584 1636 119685 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749
- 7290 7290 7290
- 1051
- 1051 1051
- 1051 1051 1051
- 1051 1051 1051 1051
- 1051 1051 1051 1051 1051
- 1051 1051 1051 1051 1051 1051
- 1051 1051 1051 1051 1051 1051 1051
- 1051 1051 1051 1051 1051 1051 1051 1051
- 1051 1051 1051 1051 1051 1051 1051 1051 1051
- 1067 59503 28783
- 3724 4058
- 1010 1032 1267 1032 4688 1032 17152 1458 29356 1010 1256 1010 1293 1010 1260 1010 1652 1010 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 119685 1166 1153 1240 1159 1166 1153 1032 1051 1032 1051 1051 1032 1051 1051 1051 1032 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1051 1032 1051 1046 1051 1032 1051 1791 1051 1032 1051 2880 1051 71881 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1240 1159 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 45577 1045 6626 43555 2843 30757 1902 4200 63073 3671 14931 20040 20040 1657 1657 1975 14135 14135 83923 7290 7290 7290 45509 45509 45509 1362 6483 2151 1576 1116 2189 1514 1681 2156 1044 1576 3609 1636 5257 1063 1576 1077 1605 5257 1362 7534 3180 1494 1044 1576 1068 1636 2479 2269 26883 1063 2837 1039 45654 1261 54297 1076
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 1122 220 19 220 26062 3951
- 37 50753 261
+ 86975 15897 301

 220
 256
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 4833 225 38 225 143 140 17723
- 56 2006 3935 265
+ 144 231 7132 342

 225
 261
@@ -1,112 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-!!!!!!
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-Cửa Việt
-__ggml_vocab_test__
- discards
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
@@ -1,46 +0,0 @@
- 2550 204 18430 377
- 597 2768 298 8564
-
- 1437
- 1437 1437
- 1437 1437 1437
- 50117
- 50118
- 50140
- 50140 50118
- 50117 50118
- 31414 232
- 20920 232
- 31414 623
- 20920 623
- 20920 623 328
- 31414 6 232 328
- 20920 6 232 328
- 42 16 8103 18164 27 4 49317
- 605 40976 262 10109 18474 385 29 36807 6455
- 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328
- 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171
- 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43
- 31414
- 20920
- 1437 20920
- 1437 1437 20920
- 1437 1437 1437 20920
- 1437 1437 1437 20920 50118 1437 1437 1437 20920
- 36
- 50118 5457
- 108 3567
- 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772
- 32376 12846
- 246
- 3103
- 25631
- 46152
- 3103 25631
- 46152 3103
- 46152 25631
- 46152 46152
- 46152 3103 25631
- 347 1376 2023 12410 102 16376 1376 2023 6382 90
- 9553 5954
- 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__

 __ggml_vocab_test__
@@ -1,5 +1,5 @@
 4850 244 57 244 162 159 17722
- 75 2022 3943 284
+ 163 250 7146 361

 244
 280
@@ -1,3 +1,3 @@
 numpy~=1.26.4
 PySide6~=6.9.0
-gguf>=0.16.0
+gguf>=0.17.0
@@ -1 +1 @@
-06b715f4c170232af261425240914fa49c44f982
+94a83ba5a725ae2aee79df75dd99b2119d0478cc
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import urllib.request
+
+vendor = {
+    "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
+    "https://github.com/nlohmann/json/releases/latest/download/json_fwd.hpp": "vendor/nlohmann/json_fwd.hpp",
+
+    # sync manually
+    # "https://raw.githubusercontent.com/ochafik/minja/refs/heads/main/include/minja/minja.hpp":         "vendor/minja/minja.hpp",
+    # "https://raw.githubusercontent.com/ochafik/minja/refs/heads/main/include/minja/chat-template.hpp": "vendor/minja/chat-template.hpp",
+
+    "https://raw.githubusercontent.com/nothings/stb/refs/heads/master/stb_image.h": "vendor/stb/stb_image.h",
+
+    "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.22/miniaudio.h": "vendor/miniaudio/miniaudio.h",
+
+    "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.20.1/httplib.h": "vendor/cpp-httplib/httplib.h",
+}
+
+for url, filename in vendor.items():
+    print(f"downloading {url} to {filename}") # noqa: NP100
+    urllib.request.urlretrieve(url, filename)
@@ -21,6 +21,9 @@ add_library(llama
            llama-impl.cpp
            llama-io.cpp
            llama-kv-cache.cpp
+            llama-kv-cache-unified.cpp
+            llama-kv-cache-unified-iswa.cpp
+            llama-kv-cache-recurrent.cpp
            llama-memory.cpp
            llama-mmap.cpp
            llama-model-loader.cpp
@@ -450,6 +450,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
            { LLM_TENSOR_POS_EMBD,        "position_embd" },
            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
@@ -15,24 +15,31 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
            break;
        }
    }
-    ubatch_token.resize(!has_embd ? n_ubatch : 0);
-    ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
-    ubatch_pos.resize(n_ubatch);
-    ubatch_n_seq_id.resize(n_ubatch);
-    ubatch_seq_id.resize(n_ubatch);
-    ubatch_output.resize(n_ubatch);
+
+    udatas.push_back({});
+
+    auto & udata = udatas.back();
+
+    udata.token.resize(!has_embd ? n_ubatch : 0);
+    udata.embd.resize(has_embd ? n_embd * n_ubatch : 0);
+    udata.pos.resize(n_ubatch);
+    udata.n_seq_id.resize(n_ubatch);
+    udata.seq_id.resize(n_ubatch);
+    udata.output.resize(n_ubatch);
+
    llama_ubatch ubatch = {
        /*equal_seqs   =*/ true,
        /*n_tokens     =*/ 0,
        /*n_seq_tokens =*/ 0,
        /*n_seqs       =*/ 0,
-        /*token        =*/ !has_embd ? ubatch_token.data() : nullptr,
-        /*embd         =*/ has_embd  ? ubatch_embd.data()  : nullptr,
-        /*pos          =*/ ubatch_pos.data(),
-        /*n_seq_id     =*/ ubatch_n_seq_id.data(),
-        /*seq_id       =*/ ubatch_seq_id.data(),
-        /*output       =*/ ubatch_output.data(),
+        /*token        =*/ !has_embd ? udata.token.data() : nullptr,
+        /*embd         =*/ has_embd  ? udata.embd.data()  : nullptr,
+        /*pos          =*/ udata.pos.data(),
+        /*n_seq_id     =*/ udata.n_seq_id.data(),
+        /*seq_id       =*/ udata.seq_id.data(),
+        /*output       =*/ udata.output.data(),
    };
+
    return ubatch;
 }

@@ -11,15 +11,15 @@ struct llama_ubatch {
    bool equal_seqs;
    // TODO: whole_seqs for embeddings?

-    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
    uint32_t n_seq_tokens; // tokens per sequence
    uint32_t n_seqs;

    llama_token  *  token;    // [n_tokens]
    float        *  embd;     // [n_embd, n_tokens]
    llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_seqs]
-    llama_seq_id ** seq_id;   // [n_seqs]
+    int32_t      *  n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
+    llama_seq_id ** seq_id;   // [n_seqs] // TODO: become llama_seq_id * seq_id;
    int8_t       *  output;   // [n_tokens]
 };

@@ -49,13 +49,18 @@ struct llama_sbatch {

    const llama_batch * batch = nullptr;

-    // buffers for the ubatch
-    std::vector<llama_token>    ubatch_token;
-    std::vector<float>          ubatch_embd;
-    std::vector<llama_pos>      ubatch_pos;
-    std::vector<int32_t>        ubatch_n_seq_id;
-    std::vector<llama_seq_id *> ubatch_seq_id;
-    std::vector<int8_t>         ubatch_output;
+    // buffers for the ubatches
+    // TODO: very hacky, this needs a complete rework
+    struct ubatch_data {
+        std::vector<llama_token>    token;
+        std::vector<float>          embd;
+        std::vector<llama_pos>      pos;
+        std::vector<int32_t>        n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<int8_t>         output;
+    };
+
+    std::vector<ubatch_data> udatas;

    llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);

@@ -6,9 +6,10 @@
 #include "llama-model.h"
 #include "llama-kv-cache.h"

-#include <cstring>
-#include <stdexcept>
 #include <cinttypes>
+#include <cstring>
+#include <limits>
+#include <stdexcept>

 //
 // llama_context
@@ -122,6 +123,11 @@ llama_context::llama_context(
                __func__, n_ctx_per_seq, hparams.n_ctx_train);
    }

+    if (!params.swa_full && cparams.n_seq_max > 1) {
+        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
+                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
+    }
+
    if (!hparams.vocab_only) {
        // GPU backends
        for (auto * dev : model.devices) {
@@ -259,15 +265,9 @@ llama_context::llama_context(

    // reserve worst-case graph
    if (!hparams.vocab_only && memory) {
-        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_seqs = cparams.n_seq_max;
        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

-        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-
-        // restore later
-        // TODO: something cleaner
-        const auto n_outputs_save = n_outputs;
-
        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);

        int n_splits_pp = -1;
@@ -279,23 +279,17 @@ llama_context::llama_context(
        // simulate full KV cache
        llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

-        kv_self->set_full();
+        const auto kv_state = kv_self->init_full();
+        if (!kv_state) {
+            throw std::runtime_error("failed to initialize KV cache");
+        }

        cross.v_embd.clear();

        // reserve pp graph first so that buffers are only allocated once
        {
-            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
-            // max number of outputs
-            n_outputs = ubatch_pp.n_tokens;
-
-            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
-
-            auto * gf = graph_init();
-            graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
-
-            if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
+            if (!gf) {
                throw std::runtime_error("failed to allocate compute pp buffers");
            }

@@ -305,16 +299,8 @@ llama_context::llama_context(

        // reserve with tg graph to get the number of splits and nodes
        {
-            llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
-            n_outputs = ubatch_tg.n_tokens;
-
-            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_tg.n_tokens, ubatch_tg.n_seqs);
-
-            auto * gf = graph_init();
-            graph_build(ctx_compute.get(), gf, ubatch_tg, LLM_GRAPH_TYPE_DEFAULT);
-
-            if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+            auto * gf = graph_reserve(1, 1, 1, kv_state.get());
+            if (!gf) {
                throw std::runtime_error("failed to allocate compute tg buffers");
            }

@@ -324,22 +310,12 @@ llama_context::llama_context(

        // reserve again with pp graph to avoid ggml-alloc reallocations during inference
        {
-            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
-            n_outputs = ubatch_pp.n_tokens;
-
-            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
-
-            auto * gf = graph_init();
-            graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
-
-            if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
+            if (!gf) {
                throw std::runtime_error("failed to allocate compute pp buffers");
            }
        }

-        n_outputs = n_outputs_save;
-
        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
            ggml_backend_t             backend = backend_ptrs[i];
            ggml_backend_buffer_type_t buft    = backend_buft[i];
@@ -453,36 +429,33 @@ const llama_kv_cache * llama_context::get_kv_self() const {
    return kv_self;
 }

-void llama_context::kv_self_update() {
-    bool need_reserve = false;
+bool llama_context::kv_self_update() { // applies pending KV cache updates; returns true only when kv_self->update() reported work, in which case a worst-case graph is reserved again
+    if (!memory) { // no memory module attached -> nothing to update
+        return false;
+    }

    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

-    need_reserve = kv_self->update(*this);
-
-    // reserve a worst case graph if needed
-    if (need_reserve) {
-        LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
-        // build worst-case graph
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-        // simulate full KV cache
-        kv_self->set_full();
-
-        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
-        auto * gf = graph_init();
-        graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
-        // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(sched.get());
-        if (!ggml_backend_sched_reserve(sched.get(), gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-        }
+    if (!kv_self->update(*this)) {
+        // no updates have been performed
+        return false;
    }
+
+    // if the KV cache did any computation, we have to reserve a new worst-case graph
+    const auto kv_state = kv_self->init_full(); // memory state used for the worst-case reservation below
+    if (!kv_state) {
+        throw std::runtime_error("failed to initialize KV cache"); // unlike a failed reserve below, this is not recoverable here
+    }
+
+    const uint32_t n_seqs   = cparams.n_seq_max; // reserve for the maximum number of sequences
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); // largest ubatch, capped by the context size
+
+    auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); // n_outputs == n_tokens: every token produces an output
+    if (!gf) {
+        LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__); // only logged; the function still reports the update below
+    }
+
+    return true; // an update was performed, even if the reservation above failed
 }

 enum llama_pooling_type llama_context::pooling_type() const {
@@ -676,6 +649,49 @@ bool llama_context::apply_adapter_cvec(
    return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }

+llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_state_i * mstate, ggml_status & ret) { // applies mstate (if given), then builds, allocates and computes the graph for one ubatch; ret is set on every path and nullptr is returned on any failure
+    if (mstate && !mstate->apply()) { // a null mstate is allowed (encode() passes nullptr) and skips this step
+        LLAMA_LOG_ERROR("%s: failed to apply memory state\n", __func__);
+        ret = GGML_STATUS_FAILED;
+        return nullptr;
+    }
+
+    auto * gf = graph_init();
+    if (!gf) {
+        LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
+        ret = GGML_STATUS_FAILED;
+        return nullptr;
+    }
+
+    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mstate);
+    if (!res) {
+        LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
+        ret = GGML_STATUS_FAILED;
+        return nullptr;
+    }
+
+    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
+        LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
+        ret = GGML_STATUS_ALLOC_FAILED; // distinct status so callers can tell allocation failures from other errors
+        return nullptr;
+    }
+
+    res->set_inputs(&ubatch); // inputs are set only after the graph has been allocated
+
+    const auto status = graph_compute(gf, ubatch.n_tokens > 1); // second argument is the `batched` flag: true for more than one token
+    if (status != GGML_STATUS_SUCCESS) {
+        LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
+        ret = status; // forward the compute status unchanged
+        return nullptr;
+    }
+
+    ret = GGML_STATUS_SUCCESS;
+
+    return res;
+}
+
 int llama_context::encode(llama_batch & inp_batch) {
    if (inp_batch.n_tokens == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -737,8 +753,6 @@ int llama_context::encode(llama_batch & inp_batch) {

    n_outputs = n_tokens;

-    //batch_manager->prepare(ubatch);
-
    ggml_backend_sched_reset(sched.get());
    ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

@@ -749,26 +763,18 @@ int llama_context::encode(llama_batch & inp_batch) {
    //       ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
    cparams.causal_attn = false;

-    auto * gf = graph_init();
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);
-
-    ggml_backend_sched_alloc_graph(sched.get(), gf);
-
-    res->set_inputs(&ubatch);
+    ggml_status status;
+    const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);

    cparams.causal_attn = causal_attn_org;

-    const auto compute_status = graph_compute(gf, n_tokens > 1);
-    switch (compute_status) {
-        case GGML_STATUS_SUCCESS:
-            break;
-        case GGML_STATUS_ABORTED:
-            return 2;
-        case GGML_STATUS_ALLOC_FAILED:
-            return -2;
-        case GGML_STATUS_FAILED:
-        default:
-            return -3;
+    if (!res) {
+        switch (status) {
+            case GGML_STATUS_ABORTED:      return  2;
+            case GGML_STATUS_ALLOC_FAILED: return -2;
+            case GGML_STATUS_FAILED:       return -3;
+            case GGML_STATUS_SUCCESS:      GGML_ABORT("should not happen");
+        }
    }

    auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
@@ -889,8 +895,6 @@ int llama_context::decode(llama_batch & inp_batch) {
    const int64_t n_tokens_all = batch.n_tokens;
    const int64_t n_embd       = hparams.n_embd;

-    llama_kv_cache_guard kv_guard(kv_self);
-
    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

    // TODO: move the validation to the llama_batch_allocr
@@ -936,7 +940,48 @@ int llama_context::decode(llama_batch & inp_batch) {
        n_outputs_all = 1;
    }

-    llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all);
+    // handle any pending defrags/shifts
+    kv_self_update();
+
+    llama_memory_state_ptr kv_state;
+
+    bool did_defrag = false;
+
+    while (true) {
+        kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        if (!kv_state) {
+            return -2;
+        }
+
+        switch (kv_state->get_status()) {
+            case LLAMA_MEMORY_STATUS_SUCCESS:
+                {
+                } break;
+            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+                {
+                    if (!did_defrag) {
+                        did_defrag = true;
+
+                        kv_self->defrag_sched(-1.0f);
+                        if (kv_self_update()) {
+                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
+
+                            continue;
+                        }
+                    }
+
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+                    return 1;
+                }
+            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+                {
+                    return -2;
+                }
+        }
+
+        break;
+    }

    // reserve output buffer
    if (output_reserve(n_outputs_all) < n_outputs_all) {
@@ -944,13 +989,10 @@ int llama_context::decode(llama_batch & inp_batch) {
        return -2;
    };

-    // handle any pending defrags/shifts
-    kv_self_update();
-
    int64_t n_outputs_prev = 0;

-    while (sbatch.n_tokens > 0) {
-        llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
+    do {
+        const auto & ubatch = kv_state->get_ubatch();

        // count the outputs in this u_batch
        {
@@ -969,33 +1011,37 @@ int llama_context::decode(llama_batch & inp_batch) {
            n_outputs = n_outputs_new;
        }

-        // find KV slot
-        if (!kv_self->find_slot(ubatch)) {
-            return 1;
-        }
-
        ggml_backend_sched_reset(sched.get());
        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

-        auto * gf = graph_init();
-        auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DECODER);
+        ggml_status status;
+        const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, kv_state.get(), status);

-        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+        if (!res) {
+            // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
+            llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES]; // lowest position per sequence id seen in this ubatch
+            std::fill_n(pos_min, LLAMA_MAX_PARALLEL_SEQUENCES, std::numeric_limits<llama_pos>::max()); // every slot needs the "untouched" sentinel: `= { max }` sets only element 0 and zeroes the rest, which would wipe all other sequences from pos 0

-        ggml_backend_sched_alloc_graph(sched.get(), gf);
+            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+                const auto & seq_id = ubatch.seq_id[i][0]; // only the first sequence id of each token is considered

-        res->set_inputs(&ubatch);
+                pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
+            }

-        const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
-        if (compute_status != GGML_STATUS_SUCCESS) {
-            switch (compute_status) {
-                case GGML_STATUS_ABORTED:
-                    return 2;
-                case GGML_STATUS_ALLOC_FAILED:
-                    return -2;
-                case GGML_STATUS_FAILED:
-                default:
-                    return -3;
+            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+                if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
+                    continue; // sequence s does not appear in the failed ubatch
+                }
+                LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
+
+                llama_kv_self_seq_rm(this, s, pos_min[s], -1);
+            }
+
+            switch (status) {
+                case GGML_STATUS_ABORTED:      return  2;
+                case GGML_STATUS_ALLOC_FAILED: return -2;
+                case GGML_STATUS_FAILED:       return -3;
+                case GGML_STATUS_SUCCESS:      GGML_ABORT("should not happen");
            }
        }

@@ -1082,10 +1128,7 @@ int llama_context::decode(llama_batch & inp_batch) {
        }

        n_outputs_prev += n_outputs;
-    }
-
-    // finalize the batch processing
-    kv_guard.commit();
+    } while (kv_state->next());

    // set to total number of outputs in the batch, for use in llama_get_logits_ith
    n_outputs = n_outputs_all;
@@ -1094,7 +1137,7 @@ int llama_context::decode(llama_batch & inp_batch) {
    {
        bool sorted_output = true;

-        auto & out_ids = sbatch.out_ids;
+        auto & out_ids = kv_state->out_ids();

        GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);

@@ -1254,11 +1297,52 @@ ggml_cgraph * llama_context::graph_init() {
    return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
 }

+ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) { // builds a graph for a dummy ubatch of the given size and reserves scheduler buffers for it; returns the graph, or nullptr if building or reserving fails
+    LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+
+    if (n_tokens % n_seqs != 0) { // NOTE(review): n_seqs == 0 is not guarded here (modulo by zero) — confirm callers always pass >= 1
+        n_tokens = (n_tokens / n_seqs) * n_seqs; // round down to a multiple of n_seqs so the per-sequence division below is exact
+        n_outputs = std::min(n_outputs, n_tokens); // never request more outputs than tokens
+
+        LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
+    }
+
+    // store the n_outputs as it is, and restore it afterwards
+    // TODO: not sure if needed, might simplify in the future by removing this
+    const auto save_n_outputs = this->n_outputs;
+
+    this->n_outputs = n_outputs; // graph_build() reads the member, not the parameter
+
+    llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+    llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+    auto * gf = graph_init();
+    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate);
+
+    this->n_outputs = save_n_outputs; // restored before any early return below
+
+    if (!res) {
+        LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
+        return nullptr;
+    }
+
+    ggml_backend_sched_reset(sched.get()); // reset the scheduler before reserving for the new graph
+
+    // initialize scheduler with the specified graph
+    if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+        return nullptr;
+    }
+
+    return gf;
+}
+
 llm_graph_result_ptr llama_context::graph_build(
-            ggml_context * ctx,
-             ggml_cgraph * gf,
-      const llama_ubatch & ubatch,
-            llm_graph_type gtype) {
+                    ggml_context * ctx,
+                     ggml_cgraph * gf,
+              const llama_ubatch & ubatch,
+                  llm_graph_type   gtype,
+      const llama_memory_state_i * mstate) {
    return model.build_graph(
            {
                /*.ctx         =*/ ctx,
@@ -1270,7 +1354,7 @@ llm_graph_result_ptr llama_context::graph_build(
                /*.backend_cpu =*/ backend_cpu,
                /*.cvec        =*/ &cvec,
                /*.loras       =*/ &loras,
-                /*.memory      =*/ memory.get(),
+                /*.mstate      =*/ mstate,
                /*.cross       =*/ &cross,
                /*.n_outputs   =*/ n_outputs,
                /*.cb          =*/ graph_get_cb(),
@@ -1951,7 +2035,6 @@ void llama_context::opt_epoch_iter(
    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

    kv_self->clear();
-    llama_kv_cache_guard kv_guard(kv_self);

    for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) {
        batch.n_tokens = n_batch;
@@ -1974,7 +2057,11 @@ void llama_context::opt_epoch_iter(

        int64_t n_outputs_all = n_tokens_all;

-        llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true);
+        auto kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true);
+        if (!kv_state || kv_state->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
+            LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
+            break;
+        }

        // reserve output buffer
        if (output_reserve(n_outputs_all) < n_outputs_all) {
@@ -1982,20 +2069,19 @@ void llama_context::opt_epoch_iter(
            GGML_ABORT("TODO: handle this error");
        };

-        for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) {
-            llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
+        uint32_t pos_batch = 0;
+        do {
+            const auto & ubatch = kv_state->get_ubatch();

            n_outputs = ubatch.n_tokens;

-            // TODO: not sure if this is needed
-            if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
+            if (!kv_state->apply()) {
+                LLAMA_LOG_ERROR("%s: failed to update the memory state\n", __func__);
+                break;
            }

            auto * gf = graph_init();
-            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
+            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, kv_state.get());

            struct ggml_context * ctx_compute_opt;
            {
@@ -2010,6 +2096,7 @@ void llama_context::opt_epoch_iter(
            }
            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
            ggml_opt_alloc(opt_ctx, train);
+
            res->set_inputs(&ubatch);
            {
                struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
@@ -2027,10 +2114,10 @@ void llama_context::opt_epoch_iter(
                callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
            }
            ggml_free(ctx_compute_opt);
-        }
-    }

-    kv_guard.commit();
+            pos_batch += ubatch.n_tokens;
+        } while (kv_state->next());
+    }
 }

 void llama_context::opt_epoch(
@@ -2194,6 +2281,7 @@ llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
    return ctx->get_kv_self();
 }

+// deprecated: forwards to llama_context::kv_self_update() and discards its bool result
 void llama_kv_self_update(llama_context * ctx) {
    ctx->kv_self_update();
 }
@@ -2448,6 +2536,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
    return kv->seq_pos_max(seq_id);
 }

+// deprecated
 void llama_kv_self_defrag(llama_context * ctx) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
@@ -2589,22 +2678,8 @@ int32_t llama_encode(
 int32_t llama_decode(
        llama_context * ctx,
          llama_batch   batch) {
-    int ret = ctx->decode(batch);
-
-    // defrag and try again
-    // TODO: distinguish return code when we are sure that even after defrag there is no space available
-    if (ret == 1) {
-        llama_kv_self_defrag(ctx);
-        ret = ctx->decode(batch);
-
-        if (ret == 1) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
-
-            return ret;
-        }
-    }
-
-    if (ret != 0) {
+    const int ret = ctx->decode(batch);
+    if (ret != 0 && ret != 1) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

@@ -18,6 +18,9 @@ struct llama_kv_cache;
 class llama_io_read_i;
 class llama_io_write_i;

+class llama_memory_i;
+class llama_memory_state_i;
+
 struct llama_context {
    // init scheduler and compute buffers, reserve worst-case graphs
    llama_context(
@@ -47,7 +50,9 @@ struct llama_context {
          llama_kv_cache * get_kv_self();
    const llama_kv_cache * get_kv_self() const;

-    void kv_self_update();
+    // return true if the KV cache was updated
+    // TODO: remove
+    bool kv_self_update();

    enum llama_pooling_type pooling_type() const;

@@ -88,6 +93,16 @@ struct llama_context {
                int32_t   il_start,
                int32_t   il_end);

+    // process a single ubatch with a specific graph type
+    // if mstate is provided, it will be applied first to the context's memory
+    // ret contains the status of the graph computation
+    // returns nullptr only if ret != GGML_STATUS_SUCCESS
+    llm_graph_result_ptr process_ubatch(
+              const llama_ubatch & ubatch,
+                  llm_graph_type   gtype,
+            llama_memory_state_i * mstate,
+                     ggml_status & ret);
+
    int encode(llama_batch & inp_batch);
    int decode(llama_batch & inp_batch);

@@ -180,16 +195,18 @@ public:
    ggml_cgraph * graph_init();

    // returns the result of ggml_backend_sched_graph_compute_async execution
-    ggml_status graph_compute(
-            ggml_cgraph * gf,
-                   bool   batched);
+    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
+
+    // reserve a graph with a dummy ubatch of the specified size
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate);

 private:
    llm_graph_result_ptr graph_build(
-            ggml_context * ctx,
-             ggml_cgraph * gf,
-      const llama_ubatch & ubatch,
-          llm_graph_type   gtype);
+                    ggml_context * ctx,
+                     ggml_cgraph * gf,
+              const llama_ubatch & ubatch,
+                  llm_graph_type   gtype,
+      const llama_memory_state_i * mstate);

    llm_graph_cb graph_get_cb() const;

@@ -3,7 +3,10 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-kv-cache.h"
+
+#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache-recurrent.h"

 #include <cassert>
 #include <cmath>
@@ -83,7 +86,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {

 void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
    if (pos_bucket) {
-        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
+        kv_state->set_input_pos_bucket(pos_bucket, ubatch); // filling the tensor is delegated to the KV cache state; skipped when pos_bucket is null
    }
 }

@@ -234,7 +237,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
 void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
    GGML_UNUSED(ubatch);

-    const int64_t n_kv = kv_self->n;
+    const int64_t n_kv = kv_state->get_n_kv();

    if (s_copy) {
        GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
@@ -242,7 +245,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {

        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
        for (uint32_t i = 0; i < n_kv; ++i) {
-            data[i] = kv_self->s_copy(i);
+            data[i] = kv_state->s_copy(i);
        }
    }
 }
@@ -250,7 +253,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
 void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
    GGML_UNUSED(ubatch);

-    const int64_t n_kv = kv_self->n;
+    const int64_t n_kv = kv_state->get_n_kv();

    if (s_mask) {
        GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
@@ -258,7 +261,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {

        // clear unused states
        for (int i = 0; i < n_kv; ++i) {
-            data[i] = kv_self->s_mask(i);
+            data[i] = kv_state->s_mask(i);
        }
    }
 }
@@ -362,17 +365,17 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {

 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
    if (self_kq_mask) {
-        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+        kv_state->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); // the KV cache state fills the mask for this ubatch; skipped when self_kq_mask is null
    }
 }

 void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
    if (self_kq_mask) {
-        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+        kv_state->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); // mask filled by the base sub-state
    }

    if (self_kq_mask_swa) {
-        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+        kv_state->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); // mask filled by the SWA sub-state, independently of the base mask
    }
 }

@@ -448,7 +451,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    backend_cpu      (params.backend_cpu),
    cvec             (params.cvec),
    loras            (params.loras),
-    memory           (params.memory),
+    mstate           (params.mstate),
    cross            (params.cross),
    cb_func          (params.cb),
    res              (std::make_unique<llm_graph_result>()) {
@@ -954,11 +957,11 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

-    auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
+    auto inp = std::make_unique<llm_graph_input_s_copy>(kv_state);

-    const auto n_kv = kv_self->n;
+    const auto n_kv = kv_state->get_n_kv();

    auto & cur = inp->s_copy;

@@ -971,11 +974,11 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

-    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
+    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);

-    const auto n_kv = kv_self->n;
+    const auto n_kv = kv_state->get_n_kv();

    auto & cur = inp->s_mask;

@@ -1025,11 +1028,11 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);

-    auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
+    auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_state);

-    const auto n_kv = kv_self->get_n();
+    const auto n_kv = kv_state->get_n_kv();

    auto & cur = inp->pos_bucket;

@@ -1231,14 +1234,14 @@ ggml_tensor * llm_graph_context::build_attn(
 }

 llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);

-    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_state);

    {
        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");

-        const auto n_kv = kv_self->get_n();
+        const auto n_kv = kv_state->get_n_kv();

        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
        //cb(inp->self_kq_mask, "KQ_mask", -1);
@@ -1268,19 +1271,19 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_build_forward_expand(gf, k_cur);
    ggml_build_forward_expand(gf, v_cur);

-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);

    // store to KV cache
    {
-        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
-        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+        ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il));
    }

    const auto & kq_mask = inp->get_kq_mask();

    ggml_tensor * q = q_cur;
-    ggml_tensor * k = kv_self->get_k(ctx0, il);
-    ggml_tensor * v = kv_self->get_v(ctx0, il);
+    ggml_tensor * k = kv_state->get_k(ctx0, il);
+    ggml_tensor * v = kv_state->get_v(ctx0, il);

    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);
@@ -1301,12 +1304,12 @@ ggml_tensor * llm_graph_context::build_attn(
 }

 llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
-    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_unified_iswa_state *>(mstate);

-    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_state);

    {
-        const auto n_kv = kv_self->get_kv_base()->get_n();
+        const auto n_kv = kv_state->get_base()->get_n_kv();

        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
        //cb(inp->self_kq_mask, "KQ_mask", -1);
@@ -1318,7 +1321,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
    {
        GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");

-        const auto n_kv = kv_self->get_kv_swa()->get_n();
+        const auto n_kv = kv_state->get_swa()->get_n_kv();

        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
@@ -1348,23 +1351,23 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_build_forward_expand(gf, k_cur);
    ggml_build_forward_expand(gf, v_cur);

+    const auto * kv_state_iswa = static_cast<const llama_kv_cache_unified_iswa_state *>(mstate);
+
    const bool is_swa = hparams.is_swa(il);

-    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
-
-    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+    const auto * kv_state = is_swa ? kv_state_iswa->get_swa() : kv_state_iswa->get_base();

    // store to KV cache
    {
-        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
-        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+        ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il));
    }

    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

    ggml_tensor * q = q_cur;
-    ggml_tensor * k = kv->get_k(ctx0, il);
-    ggml_tensor * v = kv->get_v(ctx0, il);
+    ggml_tensor * k = kv_state->get_k(ctx0, il);
+    ggml_tensor * v = kv_state->get_v(ctx0, il);

    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);
@@ -1446,12 +1449,12 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
             int32_t   n_state,
             int32_t   n_seqs) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

-    const auto n_kv    = kv_self->n;
-    const auto kv_head = kv_self->head;
+    const auto n_kv    = kv_state->get_n_kv();
+    const auto kv_head = kv_state->get_head();

-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self->size);
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size());

    // copy states
    // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
@@ -1478,13 +1481,13 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
  const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

    const auto token_shift_count = hparams.token_shift_count;

    const int64_t n_seqs  = ubatch.n_seqs;

-    ggml_tensor * token_shift_all = kv_self->k_l[il];
+    ggml_tensor * token_shift_all = kv_state->get_k_l(il);

    ggml_tensor * token_shift = build_copy_mask_state(
            gf, token_shift_all, state_copy, state_mask,
@@ -1499,19 +1502,19 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
  const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

    const auto token_shift_count = hparams.token_shift_count;
    const auto n_embd = hparams.n_embd;

    const int64_t n_seqs = ubatch.n_seqs;

-    const auto kv_head = kv_self->head;
+    const auto kv_head = kv_state->get_head();

    return ggml_cpy(
        ctx0,
        ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0),
-        ggml_view_1d(ctx0, kv_self->k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self->k_l[il]))
+        ggml_view_1d(ctx0, kv_state->get_k_l(il), hparams.n_embd_k_s()*n_seqs, hparams.n_embd_k_s()*kv_head*ggml_element_size(kv_state->get_k_l(il)))
    );
 }

@@ -17,10 +17,11 @@ struct ggml_tensor;
 struct llama_ubatch;
 struct llama_cparams;

-class llama_memory_i;
-class llama_kv_cache_unified;
-class llama_kv_cache_unified_iswa;
-class llama_kv_cache_recurrent;
+class llama_memory_state_i;
+
+class llama_kv_cache_unified_state;
+class llama_kv_cache_unified_iswa_state;
+class llama_kv_cache_recurrent_state;

 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@@ -133,7 +134,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
-            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
+            const llama_kv_cache_unified_state * kv_state) : hparams(hparams), kv_state(kv_state) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;

    void set_input(const llama_ubatch * ubatch) override;
@@ -141,7 +142,7 @@ public:
    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

    const llama_hparams & hparams;
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_state * kv_state;
 };

 class llm_graph_input_out_ids : public llm_graph_input_i {
@@ -188,26 +189,26 @@ public:

 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
    virtual ~llm_graph_input_s_copy() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_copy; // I32 [kv_size]

-    const llama_kv_cache_recurrent * kv_self;
+    const llama_kv_cache_recurrent_state * kv_state;
 };

 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
    virtual ~llm_graph_input_s_mask() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_mask; // F32 [1, n_kv]

-    const llama_kv_cache_recurrent * kv_self;
+    const llama_kv_cache_recurrent_state * kv_state;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -247,10 +248,10 @@ public:
    llm_graph_input_attn_kv_unified(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
-            const llama_kv_cache_unified * kv_self) :
+            const llama_kv_cache_unified_state * kv_state) :
        hparams(hparams),
        cparams(cparams),
-        kv_self(kv_self) {
+        kv_state(kv_state) {
    }
    ~llm_graph_input_attn_kv_unified() = default;

@@ -264,7 +265,7 @@ public:
    const llama_hparams & hparams;
    const llama_cparams & cparams;

-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_state * kv_state;
 };

 class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
@@ -272,10 +273,10 @@ public:
    llm_graph_input_attn_kv_unified_iswa(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
-            const llama_kv_cache_unified_iswa * kv_self) :
+            const llama_kv_cache_unified_iswa_state * kv_state) :
        hparams(hparams),
        cparams(cparams),
-        kv_self(kv_self) {
+        kv_state(kv_state) {
    }
    ~llm_graph_input_attn_kv_unified_iswa() = default;

@@ -292,7 +293,7 @@ public:
    const llama_hparams & hparams;
    const llama_cparams & cparams;

-    const llama_kv_cache_unified_iswa * kv_self;
+    const llama_kv_cache_unified_iswa_state * kv_state;
 };

 class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -383,10 +384,10 @@ struct llm_graph_params {
    ggml_backend_sched_t sched;
    ggml_backend_t backend_cpu;

-    const llama_adapter_cvec  * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_i      * memory;
-    const llama_cross         * cross;
+    const llama_adapter_cvec   * cvec;
+    const llama_adapter_loras  * loras;
+    const llama_memory_state_i * mstate;
+    const llama_cross          * cross;

    int32_t n_outputs;

@@ -435,10 +436,10 @@ struct llm_graph_context {

    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

-    const llama_adapter_cvec  * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_i      * memory;
-    const llama_cross         * cross;
+    const llama_adapter_cvec   * cvec;
+    const llama_adapter_loras  * loras;
+    const llama_memory_state_i * mstate;
+    const llama_cross          * cross;

    const llm_graph_cb & cb_func;

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
shalinib-ibm	093e3f1feb	cmake : Handle mixed-case 'Power' strings in POWER CPU detection (#13966 ) Some systems report the CPU implementation as "Power11" instead of "POWER11". The existing CMake logic uses a case-sensitive regular expression to extract the CPU generation, which fails when the casing doesn't exactly match "POWER". This patch provides a fix by first converting the string to uppercase before applying the regex. Signed-off-by: root <root@rheldb2v.pperf.tadn.ibm.com> Co-authored-by: root <root@rheldb2v.pperf.tadn.ibm.com>	2025-06-02 15:18:36 +03:00
Atharva Dubey	663445b0de	sycl: quantize and reorder the input to q8_1 when reorder is enabled (#13826 ) * [WIP]: fuse q8 quantization and reorder * wip2: fuse q8 quantization and reorder * working q8 reorder commit * restored common.hpp * remove debug prints * remove unnecessary headers and remove trailing whitespace * Update ggml/src/ggml-sycl/ggml-sycl.cpp Co-authored-by: Alberto Cabrera Pérez <alberto.cabrera@intel.com> --------- Co-authored-by: Alberto Cabrera Pérez <alberto.cabrera@intel.com>	2025-06-02 10:12:20 +01:00
Johannes Gäßler	7675c555a1	gguf: fix failure on version == 0 (#13956 )	2025-06-01 18:08:05 +02:00
Sigbjørn Skjæret	5e1c3aed40	convert : fix nomic-bert-moe mask token (#13757 )	2025-06-01 18:07:21 +02:00
Sigbjørn Skjæret	c496fe0b1d	convert : fix vocab padding code for bert models (#13954 )	2025-06-01 17:23:11 +02:00
Aaron Teo	e57bb87ced	ggml: check if non-native endian model is being loaded (#13943 ) * gguf: prevent non-native endian models from being loaded Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * gguf: update error message Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * gguf: make the non-native endian check more verbose Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml: move ggml_assert location Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml: reword the endianness check error message Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> --------- Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2025-06-01 16:53:57 +02:00
Georgi Gerganov	f3a4b1659c	sync : ggml ggml-ci	2025-06-01 13:43:57 +03:00
Kai Pastor	108009f5c7	vulkan : Remove unexpected ; (ggml/1253)	2025-06-01 13:43:57 +03:00
Kai Pastor	d337252acf	cmake : Fix broken CMake error messages (ggml/1252)	2025-06-01 13:43:57 +03:00
Radoslav Gerganov	af6f91db47	ggml : remove ggml_graph_import and ggml_graph_export declarations (ggml/1247) The implementation is already deleted with commit 9d0762e. closes: #1235	2025-06-01 13:43:57 +03:00
Georgi Gerganov	a7b8d35f78	sync : whisper.cpp (ggml/1250) * ggml : Fix backtrace breaking Windows build (whisper/3203) * sync : whisper.cpp ggml-ci --------- Co-authored-by: Daniel Tang <danielzgtg.opensource@gmail.com>	2025-06-01 13:43:57 +03:00
Radoslav Gerganov	6eba72b71c	ggml : install dynamic backends (ggml/1240) * ggml : install dynamic backends Make sure dynamic backends are installed in $CMAKE_INSTALL_BINDIR	2025-06-01 13:43:57 +03:00
Daniel Tang	fedf034a98	ggml : Print backtrace on uncaught C++ exceptions (ggml/1232) The goal is to have what users call "full logs" contain the backtrace. This is registered upon ggml_init. Also fixes a minor fd leak on Linux.	2025-06-01 13:43:57 +03:00
ddh0	8726392d3d	readme : update bindings (#13950 )	2025-06-01 11:44:30 +03:00
Georgi Gerganov	c04621711a	parallel : fix n_junk == 0 (#13952 )	2025-06-01 11:42:16 +03:00
Georgi Gerganov	0fc16b42e8	kv-cache : split implementation in separate sources (#13920 ) ggml-ci	2025-06-01 11:39:27 +03:00
Max Krasnyansky	053b1539c0	threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling (#12995 ) * threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling We talked about adding LOW priority for GGML threads in the original threadpool PR. It might be useful for some cases to avoid contention. Latest Windows ARM64 releases started parking (offlining) the CPU cores more aggresively which results in suboptimal performance with n_threads > 4. To deal with that we now disable Power Throttling for our threads for the NORMAL and higher priorities. Co-authored-by: Diego Devesa <slarengh@gmail.com> * threading: disable SetThreadInfo() calls for older Windows versions * Update tools/llama-bench/llama-bench.cpp Co-authored-by: Diego Devesa <slarengh@gmail.com> --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>	2025-05-31 15:39:19 -07:00
Jiří Podivín	b3a89c3d9e	docs : Note about necessity of having libcurl installed for standard build. (#13945 ) Signed-off-by: Jiri Podivin <jpodivin@gmail.com>	2025-05-31 18:58:35 +02:00
Olivier Chafik	e15898d1c7	server: allow unclosed thinking tags (#13931 )	2025-05-31 08:26:10 -07:00
Georgi Gerganov	803f8baf4f	llama : deprecate explicit kv_self defrag/update calls (#13921 ) ggml-ci	2025-05-31 15:58:33 +03:00
Georgi Gerganov	3600cc2886	llama : use n_swa + n_ubatch cells for SWA cache (#13833 ) * llama : use n_swa + n_ubatch cells for SWA cache ggml-ci * llama : add warning about multi-sqeuence SWA contexts	2025-05-31 15:57:44 +03:00
igardev	c7e0a2054b	webui : Replace alert and confirm with custom modals. (#13711 ) * Replace alert and confirm with custom modals. This is needed as Webview in VS Code doesn't permit alert and confirm for security reasons. * use Modal Provider to simplify the use of confirm and alert modals. * Increase the z index of the modal dialogs. * Update index.html.gz * also add showPrompt * rebuild --------- Co-authored-by: igardev <ivailo.gardev@akros.ch> Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2025-05-31 11:56:08 +02:00
Georgi Gerganov	3f55f781f1	llama : auto-batch preparation (#13845 ) * llama : auto-batch ggml-ci * context : simplify if branching	2025-05-31 12:55:57 +03:00
Xuan-Son Nguyen	51fa76f172	mtmd : drop `_shared` from `libmtmd` name, merge helpers into libmtmd (⚠️ breaking change) (#13917 ) * mtmd : fix missing public header * no object * apply suggestion from Georgi * rm mtmd-helper, merge it to mtmd * missing vendor include dir	2025-05-31 10:14:29 +02:00
Georgi Gerganov	12d0188c0d	kv-cache : refactor + add llama_memory_state_i (#13746 ) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci	2025-05-31 10:24:04 +03:00
Shawn yang	eb3949938e	CUDA: add a prop in ggml_cuda_device_infor for distinguish iGPU or dGPU in cuda (#13856 ) (#13895 ) * 1. add "integrated" in ggml_cuda_device_info for distinguish whether it is Intergrate_gpu or discrete_gpu 2. Adjust the func:"ggml_backend_cuda_device_supports_buft" for this new feature * Update ggml/src/ggml-cuda/ggml-cuda.cu Adjusted code indentation Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Update ggml/src/ggml-cuda/ggml-cuda.cu Fixed incorrect setting of variable types Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Update ggml/src/ggml-cuda/ggml-cuda.cu Adjusted the judgment logic Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * add a host_buft assert in case of integrated_cuda_device with func:'evaluate_and_capture_cuda_graph()' * Update ggml/src/ggml-cuda/ggml-cuda.cu Add a defensive security assert Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Update ggml/src/ggml-cuda/ggml-cuda.cu Adjusted the support judgment logic. Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * revoke the suggest commit changes due to it's not applicable in jetson_device * Update ggml/src/ggml-cuda/ggml-cuda.cu Add parentheses to enforce operator precedence Co-authored-by: Diego Devesa <slarengh@gmail.com> * Update ggml/src/ggml-cuda/ggml-cuda.cu Fix ci bug: add a spaces Co-authored-by: Johannes Gäßler <johannesg@5d6.de> --------- Co-authored-by: yangxiao <yang_xl@tju.edu.cn> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: yangxiao <yangxl_zz@qq.com> Co-authored-by: Diego Devesa <slarengh@gmail.com>	2025-05-31 08:48:04 +02:00
Johannes Gäßler	e562eece7c	CUDA: fix typo in FlashAttention code (#13926 )	2025-05-30 21:22:03 +02:00
Diego Devesa	b47ab7b8e9	sched : avoid changing cur_copy when a graph is already allocated (#13922 )	2025-05-30 18:56:19 +02:00
Georgi Gerganov	dd665cc9d4	parallel : increase the variability of the prompt lengths (#13927 ) ggml-ci	2025-05-30 19:38:07 +03:00
Diego Devesa	df0c0c7d02	cuda : prevent using split buffers with 3d/4d matrices (#13919 )	2025-05-30 16:37:18 +02:00
Akarshan Biswas	b49a8ff96b	SYCL: Add mrope kernel (#13755 ) * SYCL: Add mrope kernel * feat: Optimize rope operations with vectorization Uses `sycl::vec` to load and store two elements at a time, significantly improving performance in `rope_norm`, `rope_neox`, and `rope_multi`. This reduces the number of memory accesses and leverages SIMD instructions for faster execution. * Use ceil_div	2025-05-30 19:40:57 +05:30
Georgi Gerganov	53f925074d	sync : vendor (#13901 ) * sync : vendor ggml-ci * cont : fix httplib version ggml-ci * cont : fix lint * cont : fix lint * vendor : move to common folder /vendor ggml-ci * cont : fix lint * cont : move httplib to /vendor + use json_fwd.hpp ggml-ci * cont : fix server build ggml-ci * cont : add missing headers ggml-ci * cont : header clean-up ggml-ci	2025-05-30 16:25:45 +03:00
Sigbjørn Skjæret	db38704f01	convert : fix rwkv bos/eos token (#13844 )	2025-05-30 14:50:43 +02:00
Xuan-Son Nguyen	07e4351ce6	convert : allow partial update to the chkhsh pre-tokenizer list (#13847 ) * convert : allow partial update to the chkhsh pre-tokenizer list * code style * update tokenizer out * rm inp/out files for models not having gguf * fixed hash for glm * skip nomic-bert-moe test * Update convert_hf_to_gguf_update.py * fix minerva-7b hash * rm redundant import	2025-05-30 12:24:37 +02:00
Đinh Trọng Huy	291f2b6913	llama : add support for DistilBert (#13907 ) * add distilbert * small fixes * add note for LLM_ARCH_DISTIL_BERT * Use MODEL_ARCH.BERT for DistilBert --------- Co-authored-by: dinhhuy <huy.dinh@brains-tech.co.jp>	2025-05-30 11:56:02 +02:00
zhangkaihuo	2c90da4c7e	llama : use llm_build_granite for minicpm (#13911 )	2025-05-30 10:31:48 +02:00
Christian Kastner	ec9e0301fe	cmake: Guard GGML_CPU_ALL_VARIANTS by architecture (#13890 )	2025-05-30 01:28:54 +02:00
Sigbjørn Skjæret	e83ba3e460	llama : add support for jina-reranker-v2 (#13900 )	2025-05-29 21:42:31 +02:00
Sigbjørn Skjæret	2b131621e6	gguf-py : add support for sub_type (in arrays) in GGUFWriter add_key_value method (#13561 )	2025-05-29 15:36:05 +02:00
Yibo Cai	54a2c7a8cd	arm64: optimize q4_k_q8_k kernel with i8mm (#13886 ) This PR improves q4_k_q8_k gemm kernel with arm64 i8mm instruction. Tested on neoverse-n2 with llama3 8b q4_k_m quantization model. - 34% ~ 50% S_PP uplift for all batch sizes - 12% ~ 37% S_TG uplift for batch size 4 and above Perplexity doesn't change with this PR. ``` // tested on neoverse-n2 $ llama-batched-bench \ -m Meta-Llama-3-8B-Instruct-Q4_K_M.gguf \ --no-mmap -fa \ -c 8192 -b 4096 -ub 512 -npp 128 -ntg 128 \ -npl 1,2,4,8,16,32 \ -t 64 --------------------------------------------------------------------- \| PP \| TG \| B \| S_PP t/s \| S_TG t/s \| \| \| \| \| original \| this pr \| original \| this pr \| \|-------\|--------\|------\|----------\|----------\|----------\|----------\| \| 128 \| 128 \| 1 \| 110.12 \| 147.83 \| 24.36 \| 24.28 \| \| 128 \| 128 \| 2 \| 121.16 \| 172.42 \| 46.36 \| 47.93 \| \| 128 \| 128 \| 4 \| 120.15 \| 169.75 \| 74.68 \| 84.00 \| \| 128 \| 128 \| 8 \| 130.97 \| 196.81 \| 91.04 \| 114.74 \| \| 128 \| 128 \| 16 \| 131.01 \| 196.88 \| 101.43 \| 135.79 \| \| 128 \| 128 \| 32 \| 130.85 \| 196.51 \| 106.97 \| 147.29 \| --------------------------------------------------------------------- ```	2025-05-29 14:39:20 +03:00
Christian Kastner	21fcc21ad5	cmake: Factor out CPU architecture detection (#13883 ) * cmake: Define function for querying architecture The tests and results match exactly those of ggml/src/CMakeLists.txt * Switch arch detection over to new function	2025-05-29 12:50:25 +02:00
Vineel Abhinav	dd8ba93416	ggml: aarch64: Implement SVE F32 kernels for Mamba Sequential Scan Algorithm (#13882 ) * F32-Mamba-Seq_Scan-SVE * Fix formatting * ggml : missing space --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2025-05-29 12:18:43 +03:00
Georgi Gerganov	66c92061f5	tests : remove json.hpp from a test (#13880 ) ggml-ci	2025-05-29 12:17:16 +03:00
Sigbjørn Skjæret	5ca82fc1d7	convert : workaround for AutoConfig dummy labels (#13881 )	2025-05-29 10:00:57 +02:00