llama : add high-throughput mode (#14363)

* kv-cache : prepare K/V buffers for separation
  ggml-ci
* batched-bench : fix oob write
  ggml-ci
* llama : add "virtual sequences"
  ggml-ci
* llama : use "stream" vs "virtual sequence"
  ggml-ci
* graph : fix stream splitting when KV cache is not used
  ggml-ci
* kv-cache : add multi-stream save/load support
  ggml-ci
* llama : add "--attn-streams" flag
  ggml-ci
* kv-cache : fix handling when find_slot fails
  ggml-ci
* kv-cache : restore find_slot impl
  ggml-ci
* kv-cache : add comments
* kv-cache : add bounds checks for sequence id
  ggml-ci
* cont : add n_seq_max to batch allocr
  ggml-ci
* kv-cache : perform stream copies lazily after llama_synchronize
  ggml-ci
* kv-cache : avoid throwing exceptions across the C boundary
  ggml-ci
* CUDA: 4D FlashAttention support (#14628)
* CUDA: 4D FlashAttention support
* CUDA: fix WMMA FA kernel
* llama : rename attn_streams -> kv_unified
  ggml-ci
* common : rename kv_split -> kv_unified
  ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Support diffusion models: Add Dream 7B (#14644)
2026-07-04 19:45:57 +02:00 · 2025-07-16 16:35:42 +03:00 · 2025-07-16 20:03:51 +08:00 · 2025-07-16 14:43:32 +03:00 · 2025-07-16 14:04:12 +03:00 · 2025-07-16 12:12:22 +02:00
57 changed files with 2444 additions and 590 deletions
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
+    // --kv-unified / -kvu: switch the KV cache to a single buffer shared by all sequences.
+    // The environment variable is named after the flag; the previous "KV_SPLIT" name was a
+    // leftover from before the kv_split -> kv_unified rename and described the opposite setting.
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_UNIFIED"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -3423,5 +3431,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+                      params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
    return ctx_arg;
 }
@@ -1005,15 +1005,21 @@ struct common_init_result common_init_from_params(common_params & params) {
        params.sampling.ignore_eos = false;
    }

-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-            if (llama_vocab_is_eog(vocab, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }

+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
    if (params.sampling.penalty_last_n == -1) {
        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
        params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -1157,6 +1163,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.no_perf           = params.no_perf;
    cparams.op_offload        = !params.no_op_offload;
    cparams.swa_full          = params.swa_full;
+    cparams.kv_unified        = params.kv_unified;

    cparams.type_k = params.cache_type_k;
    cparams.type_v = params.cache_type_v;
@@ -81,6 +81,7 @@ enum llama_example {
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -177,7 +178,8 @@ struct common_params_sampling {
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token>               preserved_tokens;

-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    // print the parameters into a string
    std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };

+struct common_params_diffusion { // options for the diffusion example (LLAMA_EXAMPLE_DIFFUSION)
+    int32_t steps       = 64;     // number of diffusion steps (--diffusion-steps)
+    float   eps         = 1e-3f;  // epsilon for timesteps (--diffusion-eps)
+    int32_t algorithm   = 0;      // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) (--diffusion-algorithm)
+    float   alg_temp    = 0.0f;   // algorithm temperature (--diffusion-alg-temp)
+    bool    visual_mode = false;  // show progressive diffusion on screen (--diffusion-visual)
+};
+
 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -268,6 +278,7 @@ struct common_params {
    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;
+    struct common_params_diffusion   diffusion;

    struct common_params_model model;

@@ -330,6 +341,7 @@ struct common_params {
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on inifinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    bool kv_unified        = false; // enable unified KV cache

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap          = true;  // use mmap for faster loads
@@ -669,6 +669,36 @@ class TextModel(ModelBase):
        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+            res = "hunyuan"
+        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+            res = "falcon-h1"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+            res = "falcon-h1"
+        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+            res = "falcon-h1"
+        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+            res = "falcon-h1"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
+            res = "kimi-k2"
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
@@ -804,36 +834,9 @@ class TextModel(ModelBase):
        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
            res = "seed-coder"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
-        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
-        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
-            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
-            res = "hunyuan"
        if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
            # ref: https://huggingface.co/skt/A.X-4.0
            res = "a.x-4.0"
-        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
-            res = "falcon-h1"
-        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
-            res = "falcon-h1"
-        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
-            res = "falcon-h1"
-        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
-            res = "falcon-h1"
        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
            res = "midm-2.0"
@@ -2775,6 +2778,76 @@ class Qwen2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("DreamModel")
+class DreamModel(TextModel):  # HF -> GGUF converter for the "DreamModel" architecture (GGUF arch DREAM)
+    model_arch = gguf.MODEL_ARCH.DREAM
+
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:  # returns (tokens, token types, pre-tokenizer name)
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        vocab_dict = tokenizer.get_vocab()
+        vocab_size = self.hparams.get("vocab_size", len(vocab_dict))  # falls back to the tokenizer's own size when hparams has no "vocab_size"
+        assert max(vocab_dict.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}  # token id -> token string
+        added_vocab = tokenizer.get_added_vocab()
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:  # ids the tokenizer does not define become [PAD{i}] placeholders
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                # Check if it's a special token - treat special tokens as CONTROL tokens
+                if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
+                    if tokenizer.added_tokens_decoder[i].special:
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
+                    toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        return tokens, toktypes, tokpre
+
+    def set_vocab(self):  # try the sentencepiece path first, fall back to _set_vocab_gpt2 when its file is missing
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):  # base parameters plus the Dream-specific metadata written below
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+
+        # Dream models use non-causal attention for diffusion
+        self.gguf_writer.add_causal_attention(False)
+        # Handle RoPE scaling similar to Qwen2
+        rope_scaling = self.hparams.get("rope_scaling") or {}  # "or {}" covers both a missing key and an explicit null
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        # Add Dream-specific parameters
+        mask_token_id = self.hparams.get("mask_token_id")
+        if mask_token_id is not None:  # only written when the HF config defines it
+            self.gguf_writer.add_mask_token_id(mask_token_id)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Dream model tensors should be mapped directly since it's the base model
+        yield from super().modify_tensors(data_torch, name, bid)
+
@ModelBase.register("Ernie4_5_ForCausalLM")
 class Ernie4_5Model(TextModel):
    model_arch = gguf.MODEL_ARCH.ERNIE4_5
@@ -5739,7 +5812,58 @@ class DeepseekV2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK2

    def set_vocab(self):  # write the tokenizer vocab; has a dedicated fallback for the "kimi-k2" pre-tokenizer
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:  # best-effort: any failure falls through to the tokenizer-specific path below
+            pass
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        if tokpre == "kimi-k2":
+            # Build merges list using the approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks  # presumably maps token bytes -> rank (tiktoken-style) - TODO confirm
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:  # length-1 tokens are recorded in vocab but get no merge entry
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:  # only results with exactly two parts are emitted as a merge
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}  # token id -> token string
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:  # ids with no token become [PAD{i}] placeholders
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)  # merges were already written explicitly above
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")

    def set_gguf_parameters(self):

@@ -146,6 +146,7 @@ pre_computed_hashes = [
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
+    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
 ]


@@ -231,7 +232,7 @@ for model in models:
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
-for model in [*all_models, *pre_computed_hashes]:
+for model in [*pre_computed_hashes, *all_models]:
    name = model["name"]
    tokt = model["tokt"]
    chkhsh = model.get("chkhsh")
@@ -239,11 +240,6 @@ for model in [*all_models, *pre_computed_hashes]:
    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue

-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
    # create the tokenizer
    if chkhsh is not None:
        # if the model has a pre-computed hash, use it
@@ -253,6 +249,12 @@ for model in [*all_models, *pre_computed_hashes]:
        chkhsh = existing_models[name]
    else:
        # otherwise, compute the hash of the tokenizer
+
+        # Skip if the tokenizer folder does not exist or there are other download issues previously
+        if not os.path.exists(f"models/tokenizers/{name}"):
+            logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+            continue
+
        try:
            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
            if name == "t5":
@@ -33,6 +33,7 @@ else()
    add_subdirectory(speculative-simple)
    add_subdirectory(gen-docs)
    add_subdirectory(training)
+    add_subdirectory(diffusion)
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading
@@ -0,0 +1,5 @@
+set(TARGET llama-diffusion-cli)  # executable name for the diffusion example
+add_executable(${TARGET} diffusion-cli.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)  # requires C++17 or newer
@@ -0,0 +1,507 @@
+#include "arg.h"
+#include "chat.h"
+#include "common.h"
+#include "llama.h"
+#include "log.h"
+
+#include <limits.h>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <random>
+
+typedef bool (*diffusion_step_callback_t)(int32_t step,                // index of the step about to run; returning false stops generation
+                                          int32_t total_steps,         // total number of steps configured
+                                          const llama_token * tokens,  // current contents of the output token buffer
+                                          int32_t n_tokens,            // number of entries in tokens
+                                          void * user_data);           // opaque pointer taken from diffusion_params::step_callback_user_data
+
+enum diffusion_alg {
+    DIFFUSION_ALG_ORIGIN       = 0,  // each masked position is filled at random with a per-step transfer probability
+    DIFFUSION_ALG_MASKGIT_PLUS = 1,  // confidence = probability of the sampled token
+    DIFFUSION_ALG_TOPK_MARGIN  = 2,  // confidence = p of candidate 0 minus p of candidate 1
+    DIFFUSION_ALG_ENTROPY      = 3,  // confidence = sum of p * log(p) over the candidates (negative entropy)
+};
+
+struct diffusion_params {  // settings consumed by diffusion_generate
+    int32_t                   steps;                    // number of denoising steps
+    float                     eps;                      // final timestep; timesteps run linearly from 1.0 down to eps
+    float                     temperature;              // sampling temperature; applied only when > 0
+    float                     top_p;                    // top-p cutoff; applied only when < 1.0
+    int32_t                   top_k;                    // top-k cutoff; applied only when > 0
+    llama_token               mask_token_id;            // token that marks positions still to be generated
+    enum diffusion_alg        algorithm;                // strategy for choosing which masked positions to fill each step
+    float                     alg_temp;                 // 0 = pick positions by confidence order; otherwise confidences are divided by this and sampled
+    diffusion_step_callback_t step_callback;            // optional; called at the start of every step
+    void *                    step_callback_user_data;  // passed through to step_callback unchanged
+    int32_t                   seed;                     // seeds the RNG and the distribution samplers
+};
+
+
+static diffusion_params diffusion_default_params() {  // returns a diffusion_params filled with the built-in defaults
+    diffusion_params params        = {};
+    params.steps                   = 64;
+    params.eps                     = 1e-3f;
+    params.temperature             = 0.2f;
+    params.top_p                   = 0.95f;
+    params.top_k                   = 0;
+    params.mask_token_id           = LLAMA_TOKEN_NULL;  // no mask token selected by default
+    params.algorithm               = DIFFUSION_ALG_ORIGIN;
+    params.alg_temp                = 0.0f;
+    params.step_callback           = nullptr;  // no per-step callback by default
+    params.step_callback_user_data = nullptr;
+    params.seed                    = 0;
+    return params;
+}
+
+static void diffusion_generate(llama_context * ctx,
+                        const llama_token * input_tokens,
+                        llama_token * output_tokens,
+                        int32_t n_input,
+                        int32_t max_length,
+                        struct diffusion_params params,
+                        int32_t & n_generated) {
+
+    n_generated = 0;
+    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
+        return;
+    }
+
+    const llama_model * model = llama_get_model(ctx);
+
+    // Initialize with input and pad with mask tokens
+    std::copy(input_tokens, input_tokens + n_input, output_tokens);
+    std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);
+
+    std::mt19937 rng(params.seed);
+
+    std::vector<float> timesteps(params.steps + 1);
+    for (int32_t i = 0; i <= params.steps; i++) {
+        timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
+    }
+
+    llama_set_causal_attn(ctx, false);
+
+    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+
+    std::vector<llama_token_data> candidates(n_vocab);
+
+    std::vector<llama_token_data> conf_candidates;
+    conf_candidates.reserve(max_length);
+
+    std::vector<int32_t> mask_positions;
+    mask_positions.reserve(max_length);
+
+    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    if (params.top_k > 0) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
+    }
+    if (params.top_p < 1.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
+    }
+    if (params.temperature > 0.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
+    }
+    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
+
+    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
+
+    llama_batch batch = llama_batch_init(max_length, 0, 1);
+    batch.n_tokens    = max_length;
+
+    int64_t total_sampling_time = 0;
+    int64_t total_time = 0;
+
+    int64_t time_start = ggml_time_us();
+    for (int32_t step = 0; step < params.steps; step++) {
+        if (params.step_callback) {
+            if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
+                break;
+            }
+        }
+
+        for (int32_t i = 0; i < max_length; i++) {
+            batch.token[i]     = output_tokens[i];
+            batch.pos[i]       = i;
+            batch.n_seq_id[i]  = 1;
+            batch.seq_id[i][0] = 0;
+            batch.logits[i]    = 1;
+        }
+
+        int ret = llama_decode(ctx, batch);
+        if (ret != 0) {
+            LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
+            break;
+        }
+
+        float * raw_logits = llama_get_logits(ctx);
+        if (!raw_logits) {
+            LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
+            break;
+        }
+
+        auto get_logits_for_pos = [&](int32_t pos) -> const float * {
+            return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
+        };
+
+        int64_t time_start_sampling = ggml_time_us();
+
+        mask_positions.clear();
+        for (int32_t i = 0; i < max_length; i++) {
+            if (output_tokens[i] == params.mask_token_id) {
+                mask_positions.push_back(i);
+            }
+        }
+
+        if (mask_positions.empty()) {
+            break;
+        }
+
+        float t = timesteps[step];
+        float s = timesteps[step + 1];
+
+        if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
+            float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;
+
+            for (int32_t pos : mask_positions) {
+                if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
+                    const float * pos_logits = get_logits_for_pos(pos);
+                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                        candidates[token_id].id    = token_id;
+                        candidates[token_id].logit = pos_logits[token_id];
+                        candidates[token_id].p     = 0.0f;
+                    }
+
+                    llama_token_data_array cur_p = {
+                        /* .data       = */ candidates.data(),
+                        /* .size       = */ (size_t) n_vocab,  // Reset size to full vocab
+                        /* .selected   = */ -1,
+                        /* .sorted     = */ false,
+                    };
+
+                    llama_sampler_apply(sampler, &cur_p);
+                    output_tokens[pos] = cur_p.data[cur_p.selected].id;
+                }
+            }
+        } else {
+            std::vector<std::pair<float, int32_t>> confidences;
+            std::vector<llama_token>               sampled_tokens(mask_positions.size());
+
+            for (size_t i = 0; i < mask_positions.size(); i++) {
+                int32_t       pos        = mask_positions[i];
+                const float * pos_logits = get_logits_for_pos(pos);
+
+                for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                    candidates[token_id].logit = pos_logits[token_id];
+                    candidates[token_id].p     = 0.0f;
+                    candidates[token_id].id    = token_id;
+                }
+
+                llama_token_data_array cur_p = {
+                    /* .data       = */ candidates.data(),
+                    /* .size       = */ candidates.size(),
+                    /* .selected   = */ -1,
+                    /* .sorted     = */ false,
+                };
+
+                llama_sampler_apply(sampler, &cur_p);
+
+                llama_token sampled_token = cur_p.data[cur_p.selected].id;
+
+                float confidence = 0.0f;
+                if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
+                    const float epsilon = 1e-10f;
+                    for (size_t j = 0; j < cur_p.size; j++) {
+                        float prob = cur_p.data[j].p;
+                        confidence += prob * logf(prob + epsilon);
+                    }
+                } else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
+                    confidence = cur_p.data[0].p - cur_p.data[1].p;
+                } else {
+                    confidence = cur_p.data[cur_p.selected].p;
+                }
+
+                sampled_tokens[i] = sampled_token;
+                confidences.emplace_back(confidence, i);
+            }
+
+            int32_t num_transfer =
+                (step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();
+
+            if (num_transfer > 0) {
+                if (params.alg_temp == 0.0f) {
+                    std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
+                                      [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
+                                          if (a.first != b.first) {
+                                              return a.first > b.first;
+                                          }
+                                          return a.second < b.second;
+                                      });
+                } else {
+                    conf_candidates.clear();
+
+                    for (int32_t pos = 0; pos < max_length; pos++) {
+                        float conf_logit = -std::numeric_limits<float>::infinity();
+
+                        auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
+                        if (it != mask_positions.end()) {
+                            size_t mask_idx = std::distance(mask_positions.begin(), it);
+                            conf_logit = confidences[mask_idx].first / params.alg_temp;  // Apply temperature scaling
+                        }
+
+                        conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
+                    }
+
+                    llama_token_data_array conf_array = {
+                        /* .data       = */ conf_candidates.data(),
+                        /* .size       = */ conf_candidates.size(),
+                        /* .selected   = */ -1,
+                        /* .sorted     = */ false,
+                    };
+
+                    for (int32_t i = 0; i < num_transfer; i++) {
+                        // Apply distribution sampler to get selected index
+                        llama_sampler_apply(dist_sampler, &conf_array);
+                        int selected_idx      = conf_array.selected;
+                        confidences[i].second = conf_candidates[selected_idx].id;
+
+                        conf_candidates[selected_idx].p = 0.0f;
+                        conf_array.selected             = -1;
+                    }
+                }
+
+                if (params.alg_temp == 0.0f) {
+                    // Deterministic - use confidence order
+                    for (int32_t i = 0; i < num_transfer; i++) {
+                        int32_t     mask_idx = confidences[i].second;
+                        int32_t     pos      = mask_positions[mask_idx];
+                        llama_token token    = sampled_tokens[mask_idx];
+                        output_tokens[pos]   = token;
+                    }
+                } else {
+                    for (int32_t i = 0; i < num_transfer; i++) {
+                        int32_t pos = confidences[i].second;
+                        auto    it  = std::find(mask_positions.begin(), mask_positions.end(), pos);
+                        if (it != mask_positions.end()) {
+                            int32_t mask_idx   = std::distance(mask_positions.begin(), it);
+                            output_tokens[pos] = sampled_tokens[mask_idx];
+                        }
+                    }
+                }
+            }
+        }
+        int64_t time_end_sampling = ggml_time_us();
+        total_sampling_time += time_end_sampling - time_start_sampling;
+    }
+    int64_t time_end = ggml_time_us();
+    total_time += time_end - time_start;
+
+    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
+            total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);
+
+
+    llama_batch_free(batch);
+    llama_sampler_free(sampler);
+    llama_sampler_free(dist_sampler);
+
+    n_generated = max_length;
+}
+
+
+
+
+// Builds the text that is tokenized for generation.
+//
+// When `use_chat_template` is false, `prompt` is returned unchanged. Otherwise
+// `prompt` is wrapped in a single "user" message and rendered through the chat
+// templates initialised from `model`, with the generation prompt requested.
+static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
+    if (use_chat_template) {
+        common_chat_msg message;
+        message.role    = "user";
+        message.content = prompt;
+
+        common_chat_templates_inputs tmpl_inputs;
+        tmpl_inputs.messages.push_back(message);
+        tmpl_inputs.add_generation_prompt = true;
+
+        auto templates = common_chat_templates_init(model, "");
+
+        return common_chat_templates_apply(templates.get(), tmpl_inputs).prompt;
+    }
+
+    return prompt;
+}
+
+// State handed to diffusion_step_callback through its `user_data` pointer.
+// Field order matters: main() brace-initialises this struct positionally.
+struct callback_data {
+    // Diffusion options; the callback reads `visual_mode` from here.
+    const common_params_diffusion * diff_params;
+    // Vocabulary used to look up the mask token and to convert tokens to text.
+    const llama_vocab *             vocab;
+    // Index of the first token the callback renders in visual mode.
+    int32_t                         n_input;
+};
+
+// Progress hook installed as `step_callback` for the diffusion run.
+//
+//   step, total_steps - current step and step count shown in the progress bar
+//   tokens, n_tokens  - token buffer for this step and its length
+//   user_data         - must point to a callback_data instance
+//
+// In visual mode the screen is cleared, the progress bar is printed and the
+// tokens from index `n_input` onwards are rendered as text, with mask tokens
+// shown as a single space. Otherwise only the progress bar is printed.
+//
+// Always returns true.
+// NOTE(review): presumably "true" tells the caller to keep generating - confirm
+// against diffusion_generate.
+static bool diffusion_step_callback(int32_t step,
+                                    int32_t total_steps,
+                                    const llama_token * tokens,
+                                    int32_t n_tokens,
+                                    void * user_data) {
+    const callback_data * data = static_cast<const callback_data *>(user_data);
+
+    // 50-character wide bar plus a percentage, redrawn in place via '\r'.
+    auto print_progress_bar = [](int32_t cur, int32_t total) {
+        int progress_percent = (cur * 100) / total;
+        int progress_bars    = (cur * 50) / total;
+        LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
+            cur,
+            total,
+            std::string(progress_bars, '=').c_str(),
+            std::string(50 - progress_bars, ' ').c_str(),
+            progress_percent);
+    };
+
+    if (data->diff_params->visual_mode) {
+        // Visual mode: clear
+        LOG_INF("\033[2J\033[H");  // Clear screen and move cursor to top-left
+
+        print_progress_bar(step, total_steps);
+
+        LOG_INF("\n");
+
+        std::string current_text = " ";
+
+        for (int32_t i = data->n_input; i < n_tokens; i++) {
+            std::string token_str;
+            if (tokens[i] != llama_vocab_mask(data->vocab)) {
+                char piece[256];
+                // Reserve the last byte for the terminator written below: passing the
+                // full buffer size would let a piece that fills the buffer push the
+                // '\0' one byte past the end of `piece`.
+                int  n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece) - 1, 0, false);
+                if (n_chars > 0) {
+                    piece[n_chars] = '\0';
+                    token_str      = piece;
+                }
+            } else {
+                token_str = " ";
+            }
+
+            current_text += token_str;
+        }
+
+        LOG_INF("%s\n", current_text.c_str());
+    } else {
+        print_progress_bar(step, total_steps);
+    }
+
+    return true;
+}
+
+// Entry point of the diffusion example: parses the command line, loads the
+// model, creates a context, tokenizes the (optionally chat-templated) prompt,
+// runs diffusion_generate and logs the detokenized result.
+//
+// Returns 0 on success and 1 when argument parsing, model loading, context
+// creation, the input-length check or the generation itself fails. Every exit
+// path taken after llama_backend_init() releases the backend again.
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
+        return 1;
+    }
+
+    // Human-readable algorithm name, used only for the parameter log below.
+    const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
+    const char * alg_name    = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
+                                   alg_names[params.diffusion.algorithm] :
+                                   "UNKNOWN";
+
+    common_init();
+    llama_backend_init();
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers       = params.n_gpu_layers;
+    model_params.devices            = params.devices.data();
+    model_params.use_mmap           = params.use_mmap;
+    model_params.use_mlock          = params.use_mlock;
+    model_params.check_tensors      = params.check_tensors;
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    if (!model) {
+        LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str());
+        llama_backend_free();
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx                = params.n_ctx;
+    ctx_params.n_batch              = params.n_batch;
+    ctx_params.n_ubatch             = params.n_ubatch;
+    ctx_params.flash_attn           = params.flash_attn;
+    ctx_params.no_perf              = params.no_perf;
+    ctx_params.type_k               = params.cache_type_k;
+    ctx_params.type_v               = params.cache_type_v;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (!ctx) {
+        LOG_ERR("error: failed to create context\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return 1;
+    }
+
+    llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
+
+    const llama_vocab * vocab            = llama_model_get_vocab(model);
+    std::string         formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
+
+    std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
+                                                            /*add special tokens*/ true,
+                                                            /*parse special*/ true);
+    int                      n_input      = input_tokens.size();
+
+    if (n_input >= params.n_ctx) {
+        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return 1;
+    }
+
+    // Translate the common/sampling options into the diffusion-specific struct.
+    struct diffusion_params ldiff_params = diffusion_default_params();
+    ldiff_params.steps                   = params.diffusion.steps;
+    ldiff_params.eps                     = params.diffusion.eps;
+    ldiff_params.temperature             = params.sampling.temp;
+    ldiff_params.top_p                   = params.sampling.top_p;
+    ldiff_params.top_k                   = params.sampling.top_k;
+    ldiff_params.algorithm               = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
+    ldiff_params.alg_temp                = params.diffusion.alg_temp;
+    ldiff_params.seed                    = params.sampling.seed;
+
+    // The vocabulary must define a mask token; abort otherwise.
+    llama_token mask_token_id = llama_vocab_mask(vocab);
+    GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
+
+    LOG_INF("diffusion_params: - %-25s llama_token      = %d\n", "mask_token_id", mask_token_id);
+    LOG_INF("diffusion_params: - %-25s u32              = %d\n", "steps", params.diffusion.steps);
+    LOG_INF("diffusion_params: - %-25s f32              = %.6f\n", "eps", params.diffusion.eps);
+    LOG_INF("diffusion_params: - %-25s u32              = %d (%s)\n", "algorithm", params.diffusion.algorithm,
+            alg_name);
+    LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "alg_temp", params.diffusion.alg_temp);
+
+    ldiff_params.mask_token_id = mask_token_id;
+
+    // cb_data must outlive the diffusion_generate call: the callback receives a pointer to it.
+    callback_data cb_data = { &params.diffusion, vocab, n_input };
+
+    ldiff_params.step_callback           = diffusion_step_callback;
+    ldiff_params.step_callback_user_data = &cb_data;
+
+    int32_t n_generated = 0;
+
+    // params.n_ubatch is used both as the size of the output buffer and as the
+    // length argument passed to diffusion_generate.
+    std::vector<llama_token> output_tokens(params.n_ubatch);
+    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
+                       ldiff_params, n_generated);
+
+    int exit_code = 0;
+
+    if (n_generated > 0) {
+        if (params.diffusion.visual_mode) {
+            //clear screen and move cursor to top-left
+            LOG_INF("\033[2J\033[H");
+        }
+        // Drop the leading n_input entries (presumably the prompt) before detokenizing.
+        output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
+        std::string output_data = common_detokenize(vocab, output_tokens, false);
+        LOG_INF("\n%s\n", output_data.c_str());
+    } else {
+        // Report the failure as an error and through the exit status instead of
+        // logging it at info level and exiting with 0.
+        LOG_ERR("Error: diffusion generation failed\n");
+        exit_code = 1;
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return exit_code;
+}
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const int n_ctx_train = llama_model_n_ctx_train(model);
-    const int n_ctx = llama_n_ctx(ctx);
+    const int n_ctx       = llama_n_ctx(ctx);

    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

@@ -224,6 +224,7 @@ int main(int argc, char ** argv) {
        auto & client = clients[i];
        client.id = i;
        client.smpl = common_sampler_init(model, params.sampling);
+        //params.sampling.seed++;
    }

    std::vector<llama_token> tokens_system;
@@ -345,7 +346,7 @@ int main(int argc, char ** argv) {
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;

-                    LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
+                    LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, prompt = %d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur, client.n_prompt);

                    g_seq_id += 1;

@@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32(

                const float scale = 1.0f/sqrtf(mean + eps);

+                // if you hit this, likely you got an inf somewhere earlier
+                assert(scale > 0.0f);
+
                ggml_vec_scale_f32(ne00, y, scale);
            }
        }
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
    for (int i = np; i < n; ++i) {
        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
    }
+
+    // if you hit this, you are likely running outside the FP range
+    assert(!isnan(sumf) && !isinf(sumf));
 #else
    for (int i = 0; i < n; ++i) {
        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
@@ -33,8 +33,10 @@ typedef void (* fattn_kernel_t)(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -521,7 +523,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
 template<int D, int ncols1, int ncols2> // D == head size
 __launch_bounds__(D, 1)
 static __global__ void flash_attn_stream_k_fixup(
-        float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) {
+        float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11) {
    constexpr int ncols = ncols1*ncols2;

    const int bidx0 = blockIdx.x;
@@ -535,8 +537,8 @@ static __global__ void flash_attn_stream_k_fixup(
    const int iter_k = ne11 / FATTN_KQ_STRIDE;
    const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;

-    const int kbc0      = (bidx0 + 0)*iter_k*iter_j*(ne02/ncols2) / gridDim.x;
-    const int kbc0_stop = (bidx0 + 1)*iter_k*iter_j*(ne02/ncols2) / gridDim.x;
+    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
    const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@@ -545,14 +547,15 @@ static __global__ void flash_attn_stream_k_fixup(
        return;
    }

-    const int channel = kbc0 / (iter_k*iter_j);
-    const int jt      = (kbc0 - channel*iter_k*iter_j) / iter_k;
+    const int sequence = kbc0 / (iter_k*iter_j*(ne02/ncols2));
+    const int head = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
+    const int jt = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.

    if (jt*ncols1 + j >= ne01) {
        return;
    }

-    dst += jt*ne02*(ncols1*D) + channel*(ncols2*D) + (j*ne02 + c)*D + tid;
+    dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + head*(ncols2*D) + (j*ne02 + c)*D + tid;

    // Load the partial result that needs a fixup:
    float dst_val = 0.0f;
@@ -571,7 +574,7 @@ static __global__ void flash_attn_stream_k_fixup(
    int bidx = bidx0 - 1;
    int kbc_stop = kbc0;
    while(true) {
-        const int kbc = bidx*iter_k*iter_j*(ne02/ncols2) / gridDim.x;
+        const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
            kbc_stop = kbc;
@@ -617,16 +620,31 @@ static __global__ void flash_attn_combine_results(
        const float2 * __restrict__ VKQ_meta,
        float * __restrict__ dst,
        const int parallel_blocks) {
-    VKQ_parts += parallel_blocks*D * gridDim.z*blockIdx.x;
-    VKQ_meta  += parallel_blocks   * gridDim.z*blockIdx.x;
-    dst       +=                 D * gridDim.z*blockIdx.x;
+    // Dimension 0: threadIdx.x
+    // Dimension 1: blockIdx.x
+    // Dimension 2: blockIdx.y
+    // Dimension 3: blockIdx.z
+    // Memory layout is permuted with [0, 2, 1, 3]
+
+    const int ne01 = gridDim.x;
+    const int ne02 = gridDim.y;
+
+    const int col      = blockIdx.x;
+    const int head     = blockIdx.y;
+    const int sequence = blockIdx.z;
+
+    const int j_dst_unrolled = (sequence*ne01 + col)*ne02 + head;
+
+    VKQ_parts += j_dst_unrolled * parallel_blocks*D;
+    VKQ_meta  += j_dst_unrolled * parallel_blocks;
+    dst       += j_dst_unrolled *                 D;

    const int tid = threadIdx.x;
    __builtin_assume(tid < D);

    extern __shared__ float2 meta[];
    for (int i = tid; i < 2*parallel_blocks; i += D) {
-        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
+        ((float *) meta)[i] = ((const float *)VKQ_meta) [i];
    }

    __syncthreads();
@@ -644,11 +662,11 @@ static __global__ void flash_attn_combine_results(
        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
        *((uint32_t *) &KQ_max_scale) &= ftz_mask;

-        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.z*D + blockIdx.z*D + tid];
+        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*D + tid];
        VKQ_denominator += KQ_max_scale * meta[l].y;
    }

-    dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
+    dst[tid] = VKQ_numerator / VKQ_denominator;
 }

 [[noreturn]]
@@ -705,8 +723,6 @@ void launch_fattn(

    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");

-    GGML_ASSERT(Q->ne[3] == 1);
-
    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();
    const int id  = ggml_cuda_get_device();
@@ -853,8 +869,8 @@ void launch_fattn(
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
-        mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0,
-        mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0,
+        mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
+        mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0,
        Q->nb[1], Q->nb[2], Q->nb[3],
        nb11, nb12, nb13,
        nb21, nb22, nb23,
@@ -869,11 +885,11 @@ void launch_fattn(

            flash_attn_stream_k_fixup<DV, ncols1, ncols2>
                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
+                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]);
        }
    } else if (parallel_blocks > 1) {
        const dim3 block_dim_combine(DV, 1, 1);
-        const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
+        const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]);
        const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);

        flash_attn_combine_results<DV>
@@ -1224,8 +1224,10 @@ static __global__ void flash_attn_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -1274,8 +1276,8 @@ static __global__ void flash_attn_ext_f16(
    constexpr int kb_niter = FATTN_KQ_STRIDE / c::nbatch_fa; // Number of kernel iterations per assigned KQ slice.

    // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = (blockIdx.x + 0)*iter_k*iter_j*(ne02/ncols2) / gridDim.x;
-    const int kbc_stop = (blockIdx.x + 1)*iter_k*iter_j*(ne02/ncols2) / gridDim.x;
+    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
    // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@@ -1285,18 +1287,19 @@ static __global__ void flash_attn_ext_f16(
    int kb0_start = kbc % iter_k;
    int kb0_stop  = min(iter_k, kb0_start + kbc_stop - kbc);
    while (kbc < kbc_stop && kb0_stop == iter_k) {
-        const int channel = kbc / (iter_k*iter_j);
-        const int jt      = (kbc - channel*iter_k*iter_j) / iter_k; // j index of current tile.
+        const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
+        const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
+        const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.

-        const float2 * Q_f2    = (const float2 *) (Q + nb02* channel*ncols2);
-        const half2  * K_h2    = (const half2  *) (K + nb12*(channel*ncols2 / gqa_ratio));
+        const float2 * Q_f2    = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
+        const half2  * K_h2    = (const half2  *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
        const half2  * mask_h2 = ncols2 == 1 && !mask ? nullptr :
-            (const half2  *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
-        float2       * dstk    = ((float2 *) dst) + channel*(ncols2 * DV/2);
+            (const half2  *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
+        float2       * dstk    = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);

-        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
+        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));

-        const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f;
+        const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;

        const int kb0_start_kernel = kb0_start * kb_niter;
        const int kb0_stop_kernel  = kb0_stop  * kb_niter;
@@ -1325,18 +1328,19 @@ static __global__ void flash_attn_ext_f16(
        return;
    }

-    const int channel = kbc / (iter_k*iter_j);
-    const int jt      = (kbc - channel*iter_k*iter_j) / iter_k; // j index of current tile.
+    const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
+    const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
+    const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.

-    const float2 * Q_f2    = (const float2 *) (Q + nb02* channel*ncols2);
-    const half2  * K_h2    = (const half2  *) (K + nb12*(channel*ncols2 / gqa_ratio));
+    const float2 * Q_f2    = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
+    const half2  * K_h2    = (const half2  *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
    const half2  * mask_h2 = ncols2 == 1 && !mask ? nullptr :
-        (const half2  *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
-    float2       * dstk    = ((float2 *) dst) + channel*(ncols2 * DV/2);
+        (const half2  *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
+    float2       * dstk    = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);

-    const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
+    const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));

-    const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f;
+    const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;

    const int kb0_start_kernel = kb0_start * kb_niter;
    const int kb0_stop_kernel  = kb0_stop  * kb_niter;
@@ -31,8 +31,10 @@ static __global__ void flash_attn_tile_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -62,15 +64,17 @@ static __global__ void flash_attn_tile_ext_f16(

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

+    const int sequence = blockIdx.z / ne02;
+    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.z              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.z / gqa_ratio));
-    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *) (mask + nb32*(blockIdx.z % ne32)      + nb31*ic0);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half   * maskh = (const half   *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);

    const int stride_KV2 = nb11 / sizeof(half2);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@@ -255,6 +259,8 @@ static __global__ void flash_attn_tile_ext_f16(
        __syncthreads();
    }

+    float2 * dst2 = (float2 *) dst;
+
 #pragma unroll
    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
        const int j_VKQ = j_VKQ_0 + threadIdx.y;
@@ -266,21 +272,21 @@ static __global__ void flash_attn_tile_ext_f16(
        half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
        kqsum_j = warp_reduce_sum((float)kqsum_j);

-#pragma unroll
-        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
-            const int i0 = i00 + 2*threadIdx.x;
+        const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;

-            half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
+#pragma unroll
+        for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
+            const int i0 = i00 + threadIdx.x;
+
+            half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
            if (gridDim.y == 1) {
                dst_val /= __half2half2(kqsum_j);
            }
-            const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
-            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] =  __low2float(dst_val);
-            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = __high2float(dst_val);
+            dst2[j_dst_unrolled*(D/2) + i0] = __half22float2(dst_val);
        }

        if (gridDim.y != 1 && threadIdx.x == 0) {
-            dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+            dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
 #else
@@ -290,8 +296,8 @@ static __global__ void flash_attn_tile_ext_f16(
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -31,8 +31,10 @@ static __global__ void flash_attn_tile_ext_f32(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -74,15 +76,17 @@ static __global__ void flash_attn_tile_ext_f32(

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

+    const int sequence = blockIdx.z / ne02;
+    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.z              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.z / gqa_ratio));
-    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *) (mask + nb32*(blockIdx.z % ne32)      + nb31*ic0);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half   * maskh = (const half   *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);

    const int stride_KV2 = nb11 / sizeof(half2);

-    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
+    const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");

@@ -265,6 +269,8 @@ static __global__ void flash_attn_tile_ext_f32(
        __syncthreads();
    }

+    float2 * dst2 = (float2 *) dst;
+
 #pragma unroll
    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
        const int j_VKQ = j_VKQ_0 + threadIdx.y;
@@ -276,22 +282,22 @@ static __global__ void flash_attn_tile_ext_f32(
        float kqsum_j = kqsum[j_VKQ_0/nwarps];
        kqsum_j = warp_reduce_sum(kqsum_j);

-#pragma unroll
-        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
-            const int i0 = i00 + 2*threadIdx.x;
+        const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;

-            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
+#pragma unroll
+        for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
+            const int i0 = i00 + threadIdx.x;
+
+            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
            if (gridDim.y == 1) {
                dst_val.x /= kqsum_j;
                dst_val.y /= kqsum_j;
            }
-            const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
-            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] = dst_val.x;
-            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = dst_val.y;
+            dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
        }

        if (gridDim.y != 1 && threadIdx.x == 0) {
-            dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+            dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
 #else
@@ -28,8 +28,10 @@ static __global__ void flash_attn_vec_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -65,14 +67,16 @@ static __global__ void flash_attn_vec_ext_f16(

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

+    const int sequence = blockIdx.z / ne02;
+    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.z              + nb01*ic0;
-    K += nb12*(blockIdx.z / gqa_ratio);
-    V += nb22*(blockIdx.z / gqa_ratio);
+    Q += nb03*sequence + nb02* head              + nb01*ic0;
+    K += nb13*sequence + nb12*(head / gqa_ratio);
+    V += nb23*sequence + nb22*(head / gqa_ratio);

-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@@ -330,12 +334,11 @@ static __global__ void flash_attn_vec_ext_f16(
        if (gridDim.y == 1) {
            dst_val /= kqsum[j_VKQ];
        }
-        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
-        dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val;
+        dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
    }

    if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
-        dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
+        dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
@@ -344,8 +347,8 @@ static __global__ void flash_attn_vec_ext_f16(
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -28,8 +28,10 @@ static __global__ void flash_attn_vec_ext_f32(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -53,8 +55,8 @@ static __global__ void flash_attn_vec_ext_f32(
        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -77,14 +79,16 @@ static __global__ void flash_attn_vec_ext_f32(

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

+    const int sequence = blockIdx.z / ne02;
+    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.z              + nb01*ic0;
-    K += nb12*(blockIdx.z / gqa_ratio);
-    V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
+    Q += nb03*sequence + nb02* head              + nb01*ic0;
+    K += nb13*sequence + nb12*(head / gqa_ratio);
+    V += nb23*sequence + nb22*(head / gqa_ratio);

-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);

-    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
+    const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
@@ -326,12 +330,11 @@ static __global__ void flash_attn_vec_ext_f32(
        if (gridDim.y == 1) {
            dst_val /= kqsum[j_VKQ];
        }
-        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
-        dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val;
+        dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
    }

    if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
-        dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
+        dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
@@ -340,8 +343,8 @@ static __global__ void flash_attn_vec_ext_f32(
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
@@ -47,8 +47,10 @@ static __global__ void flash_attn_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -95,17 +97,19 @@ static __global__ void flash_attn_ext_f16(
    constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);

+    const int sequence = blockIdx.z / ne02;
+    const int head = blockIdx.z - sequence*ne02;
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f   = (const float *) (Q    + nb02* blockIdx.z              + nb01*ic0);
-    const half  * K_h   = (const half  *) (K    + nb12*(blockIdx.z / gqa_ratio));
-    const half  * V_h   = (const half  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half  * maskh = (const half  *) (mask + nb32*(blockIdx.z % ne32)      + nb31*ic0);
+    const float * Q_f   = (const float *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half  * K_h   = (const half  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half  * V_h   = (const half  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half  * maskh = (const half  *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);
    const half2 * mask2 = (const half2 *)  maskh;

    const int stride_Q  = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    const half2 slope2 = make_half2(slopef, slopef);

@@ -400,7 +404,6 @@ static __global__ void flash_attn_ext_f16(
        if (ic0 + j_VKQ >= ne01) {
            return;
        }
-        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;

        float KQ_rowsum_j;
        if (std::is_same<KQ_acc_t, float>::value) {
@@ -409,6 +412,8 @@ static __global__ void flash_attn_ext_f16(
            KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
        }

+        const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
+
 #pragma unroll
        for (int i0 = 0; i0 < D; i0 += warp_size) {
            const int i = i0 + threadIdx.x;
@@ -419,7 +424,7 @@ static __global__ void flash_attn_ext_f16(
            if (gridDim.y == 1) {
                dst_val /= KQ_rowsum_j;
            }
-            dst[j_dst*gridDim.z*D + blockIdx.z*D + i] = dst_val;
+            dst[j_dst_unrolled*D + i] = dst_val;
        }

        if (gridDim.y == 1 || threadIdx.x != 0) {
@@ -433,7 +438,7 @@ static __global__ void flash_attn_ext_f16(
            dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
        }
        dst_meta_val.y = KQ_rowsum_j;
-        dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val;
+        dst_meta[j_dst_unrolled] = dst_meta_val;
    }
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
@@ -442,7 +447,8 @@ static __global__ void flash_attn_ext_f16(
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); GGML_UNUSED(nb31);
+    GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
@@ -3413,12 +3413,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            if (op->src[0]->ne[0] == 192) {
                return false;
            }
-            // TODO: support broadcast
-            // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
-            //       the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
-            if (op->src[0]->ne[3] != 1) {
-                return false;
-            }
            if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
                return false;
            }
@@ -3431,6 +3425,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
                return true;
            }
+            if (op->src[3] && op->src[3]->ne[2] != 1) {
+                return false;
+            }
            return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) &&
                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
        }
@@ -4922,7 +4922,7 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
    return
        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+        (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]);
 }

 static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
@@ -10356,10 +10356,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                        // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
                        return false;
                    }
-                    // Check against size of shared memory variable
-                    if (op->src[2]->ne[0] > 4096) {
-                        return false;
-                    }
                }
                switch (src0_type) {
                    case GGML_TYPE_F32:
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = gl_WorkGroupID.x * 256 + wgy;
-        if (i >= p.M * p.K / QUANT_K) {
+        if (i >= p.nel / QUANT_K) {
            return;
        }

@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
+        if (i >= p.nel / QUANT_K) {
            return;
        }

@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint ib = gl_WorkGroupID.x * 256 + wgy;
-        if (ib >= p.M * p.K / QUANT_K) {
+        if (ib >= p.nel / QUANT_K) {
            return;
        }

@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint ib = gl_WorkGroupID.x * 256 + wgy;
-        if (ib >= p.M * p.K / QUANT_K) {
+        if (ib >= p.nel / QUANT_K) {
            return;
        }

@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = gl_WorkGroupID.x * 256 + wgy;
-        if (i >= p.M * p.K / QUANT_K) {
+        if (i >= p.nel / QUANT_K) {
            return;
        }
        const uint tid = gl_LocalInvocationID.x;
@@ -367,6 +367,7 @@ class MODEL_ARCH(IntEnum):
    HUNYUAN_MOE      = auto()
    SMOLLM3          = auto()
    LFM2             = auto()
+    DREAM            = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -683,6 +684,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.HUNYUAN_MOE:      "hunyuan-moe",
    MODEL_ARCH.SMOLLM3:          "smollm3",
    MODEL_ARCH.LFM2:             "lfm2",
+    MODEL_ARCH.DREAM:            "dream",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -1289,6 +1291,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.DREAM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.QWEN2VL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -234,6 +234,8 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
    markdown_content += '## Key Value Metadata Store\n\n'
    markdown_content += f'There are {len(reader.fields)} key-value pairs in this file\n'
    markdown_content += '\n'
+    total_model_bytes = 0
+    total_model_elements = 0

    kv_dump_table: list[dict[str, str | int]] = []
    for n, field in enumerate(reader.fields.values(), 1):
@@ -377,6 +379,8 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
            tensors = tensor_groups[group]
            group_elements = sum(tensor.n_elements for tensor in tensors)
            group_percentage = group_elements / total_elements * 100
+            total_group_bytes = 0
+            total_group_elements = 0
            markdown_content += f"### <a name=\"{group.replace('.', '_')}\">{translate_tensor_name(group)} Tensor Group : {element_count_rounded_notation(group_elements)} Elements</a>\n\n"

            # Precalculate column sizing for visual consistency
@@ -397,7 +401,13 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
                element_count_est = f"({element_count_rounded_notation(tensor.n_elements):>{prettify_element_est_count_size}})"
                element_count_string = f"{element_count_est} {tensor.n_elements:>{prettify_element_count_size}}"
                type_name_string = f"{tensor.tensor_type.name}"
-                tensor_dump_table.append({"t_id":tensor_name_to_key[tensor.name], "layer_name":tensor.name, "human_layer_name":human_friendly_name, "element_count":element_count_string, "pretty_dimension":pretty_dimension, "tensor_type":type_name_string})
+                if tensor.n_elements > 0:
+                    bpw = (tensor.n_bytes * 8) / tensor.n_elements
+                else:
+                    bpw = float('nan')
+                tensor_dump_table.append({"t_id":tensor_name_to_key[tensor.name], "layer_name":tensor.name, "human_layer_name":human_friendly_name, "element_count":element_count_string, "pretty_dimension":pretty_dimension, "tensor_type":type_name_string, "bpw": f"{bpw:.4f}"})
+                total_group_bytes += tensor.n_bytes
+                total_group_elements += tensor.n_elements

            tensor_dump_table_header_map = [
                {'key_name':'t_id',             'header_name':'T_ID',                             'align':'right'},
@@ -406,6 +416,7 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
                {'key_name':'element_count',    'header_name':'Elements',                         'align':'left'},
                {'key_name':'pretty_dimension', 'header_name':'Shape',                            'align':'left'},
                {'key_name':'tensor_type',      'header_name':'Type',                             'align':'left'},
+                {'key_name':'bpw',              'header_name':'BPW',                              'align':'right'},
            ]

            markdown_content += markdown_table_with_alignment_support(tensor_dump_table_header_map, tensor_dump_table)
@@ -413,8 +424,20 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
            markdown_content += "\n"
            markdown_content += f"- Total elements in {group}: ({element_count_rounded_notation(group_elements):>4}) {group_elements}\n"
            markdown_content += f"- Percentage of total elements: {group_percentage:.2f}%\n"
+            if total_group_elements > 0:
+                total_group_bpw = (total_group_bytes * 8) / total_group_elements
+                markdown_content += f"- Bits per Weight (BPW) for {group}: {total_group_bpw:.4f} bits\n"
+            else:
+                markdown_content += f"- Bits per Weight (BPW) for {group}: undefined (no elements)\n"
            markdown_content += "\n\n"
+            total_model_bytes += total_group_bytes
+            total_model_elements += total_group_elements

+    if total_model_elements > 0:
+        total_model_bpw = (total_model_bytes * 8) / total_model_elements
+        markdown_content += f"Total BPW for {os.path.basename(args.model)}: {total_model_bpw:.4f} bits"
+    else:
+        markdown_content += f"Total BPW for {os.path.basename(args.model)}: undefined (no elements)"
    print(markdown_content)  # noqa: NP100


@@ -335,6 +335,9 @@ extern "C" {
        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
    };

    // model quantization parameters
@@ -725,7 +728,7 @@ extern "C" {
    //   - lazily on next llama_decode()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    DEPRECATED(void llama_kv_self_seq_div(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@@ -1005,6 +1008,7 @@ extern "C" {
    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -0,0 +1,43 @@
+{%- if tools -%}
+  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
+{%- endif -%}
+{%- for message in messages -%}
+  {%- if loop.first and messages[0]['role'] != 'system' -%}
+    <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
+  {%- endif -%}
+  {%- if message['role'] == 'system' -%}
+    <|im_system|>system<|im_middle|>
+  {%- elif message['role'] == 'user' -%}
+    <|im_user|>user<|im_middle|>
+  {%- elif message['role'] == 'assistant' -%}
+    <|im_assistant|>assistant<|im_middle|>
+  {%- elif message['role'] == 'tool' -%}
+    <|im_system|>tool<|im_middle|>
+  {%- endif -%}
+  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
+    {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}
+    <|tool_calls_section_begin|>
+    {%- for tool_call in message['tool_calls'] -%}
+      {%- set func_name = tool_call['function']['name'] -%}
+      {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}
+      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>
+    {%- endfor -%}
+    <|tool_calls_section_end|>
+  {%- elif message['role'] == 'tool' -%}
+    ## Return of {{ message.tool_call_id }}\n{{ message['content'] }}
+  {%- elif message['content'] is string -%}
+    {{ message['content'] }}
+  {%- elif message['content'] is not none -%}
+    {% for content in message['content'] -%}
+      {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+        <|media_start|>image<|media_content|><|media_pad|><|media_end|>
+      {% else -%}
+        {{ content['text'] }}
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif -%}
+  <|im_end|>
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+  <|im_assistant|>assistant<|im_middle|>
+{%- endif -%}
@@ -2,9 +2,11 @@

 import argparse
 import json
+import os
+import random
 import subprocess
 from time import sleep, time
-from typing import Optional
+from typing import Optional, Union

 import datasets
 import logging
@@ -18,31 +20,39 @@ logging.basicConfig(level=logging.INFO, format='%(message)s')
 logger = logging.getLogger("server-bench")


-def get_prompts(n_prompts: int) -> list[str]:
-    logger.info("Loading MMLU dataset...")
-    ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
+def get_prompts_text(dataset_name: str, n_prompts: int) -> Optional[list[str]]:
+    ret = []
+    if dataset_name.lower() == "mmlu":
+        logger.info("Loading MMLU dataset...")
+        ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
+    else:
+        return None
    if n_prompts >= 0:
        ret = ret[:n_prompts]
    return ret


-def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
-    logger.info("Starting the llama.cpp server...")
-    address = f"http://localhost:{port}"
+def get_prompt_lengths_rng(n_prompts: int, prompt_length_min: int, prompt_length_max: int) -> list[int]:
+    assert n_prompts >= 0
+    ret: list[int] = []
+    for i in range(n_prompts):
+        random.seed(13 * i + 0)
+        ret.append(random.randint(prompt_length_min, prompt_length_max))
+    return ret

-    popen_args: list[str] = [
-        path_server,
-        "--flash-attn",
-        "--n-gpu-layers", str(n_gpu_layers),
-        "--parallel", str(parallel),
-        "--ctx-size", str(parallel * ctx_size),
-        "--model", path_model,
-        "--port", str(port),
-        "--swa-full",  # FIXME performance bad otherwise
-        # "--attn-streams",
-    ]
-    fout = open("bench.log", "w") if path_log is not None else subprocess.DEVNULL
-    process = subprocess.Popen(popen_args, stdout=fout, stderr=subprocess.STDOUT)
+
+def get_prompts_rng(prompt_lengths: list[int]) -> list[list[int]]:
+    return [[random.randint(100, 10000) for _ in range(pl)] for pl in prompt_lengths]
+
+
+def get_server(path_server: str, path_log: Optional[str]) -> dict:
+    logger.info("Starting the llama.cpp server...")
+    hostname: str = os.environ.get("LLAMA_ARG_HOST", "127.0.0.1")
+    port: str = os.environ.get("LLAMA_ARG_PORT", "8080")
+    address: str = f"http://{hostname}:{port}"
+
+    fout = open(path_log, "w") if path_log is not None else subprocess.DEVNULL
+    process = subprocess.Popen([path_server], stdout=fout, stderr=subprocess.STDOUT)

    n_failures: int = 0
    while True:
@@ -50,14 +60,14 @@ def get_server(path_server: str, path_model: str, path_log: Optional[str], port:
            sleep(1.0)
            exit_code = process.poll()
            if exit_code is not None:
-                raise RuntimeError(f"llama.cpp server for {path_model} exited unexpectedly with exit code {exit_code}")
+                raise RuntimeError(f"llama.cpp server exited unexpectedly with exit code {exit_code}" + (f", see {path_log}" if path_log is not None else ""))
            response = requests.get(f"{address}/health")
            if response.status_code == 200:
                break
        except requests.ConnectionError:
            n_failures += 1
            if n_failures >= 10:
-                raise RuntimeError(f"llama.cpp server for {path_model} is not healthy after 10 seconds")
+                raise RuntimeError("llama.cpp server is not healthy after 10 seconds")

    return {"process": process, "address": address, "fout": fout}

@@ -87,58 +97,97 @@ def send_prompt(data: dict) -> tuple[float, list[float]]:
    session = data["session"]
    server_address: str = data["server_address"]

-    response = session.post(
-        f"{server_address}/apply-template",
-        json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
-    )
-    if response.status_code != 200:
-        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
-    prompt: str = json.loads(response.text)["prompt"]
+    t_submit = time()
+    if data["synthetic_prompt"]:
+        json_data: dict = {
+            "prompt": data["prompt"], "ignore_eos": True, "cache_prompt": False,
+            "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
+        response = session.post(f"{server_address}/completion", json=json_data, stream=True)
+    else:
+        response = session.post(
+            f"{server_address}/apply-template",
+            json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
+        )
+        if response.status_code != 200:
+            raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
+        prompt: str = json.loads(response.text)["prompt"]

-    json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
-    response = session.post(f"{server_address}/completion", json=json_data, stream=True)
+        json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
+        response = session.post(f"{server_address}/completion", json=json_data, stream=True)

-    last_valid_line: str = ""
    token_arrival_times: list[float] = []
-    for line in response.iter_lines(decode_unicode=True):
-        if not line.startswith("data: "):
+    for line in response.iter_lines(decode_unicode=False):
+        if not line.startswith(b"data: "):
            continue
-        last_valid_line = line
        token_arrival_times.append(time())
    token_arrival_times = token_arrival_times[:-1]

    if response.status_code != 200:
        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
-    timings: dict = json.loads(last_valid_line[6:])["timings"]

-    return (timings["prompt_ms"], token_arrival_times)
+    return (t_submit, token_arrival_times)


-def benchmark(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int, n_prompts: int, n_predict: int):
-    num_workers: int = parallel + 1
-    prompts: list[str] = get_prompts(n_prompts)
+def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_prompts: int, n_predict: int, n_predict_min: int):
+    if os.environ.get("LLAMA_ARG_N_PARALLEL") is None:
+        logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32")
+        os.environ["LLAMA_ARG_N_PARALLEL"] = "32"
+    if os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None:
+        logger.info("LLAMA_ARG_N_GPU_LAYERS not explicitly set, using 999")
+        os.environ["LLAMA_ARG_N_GPU_LAYERS"] = "999"
+    if os.environ.get("LLAMA_ARG_FLASH_ATTN") is None:
+        logger.info("LLAMA_ARG_FLASH_ATTN not explicitly set, using 'true'")
+        os.environ["LLAMA_ARG_FLASH_ATTN"] = "true"
+
+    parallel: int = int(os.environ.get("LLAMA_ARG_N_PARALLEL", 1))
+    prompts: Union[None, list[str], list[list[int]]] = get_prompts_text(prompt_source, n_prompts)
+    synthetic_prompts: bool = prompts is None
+    prompt_n = []
+
+    if synthetic_prompts:
+        prompt_source_split: list[str] = prompt_source.split("-")
+        assert len(prompt_source_split) == 3
+        assert prompt_source_split[0].lower() == "rng"
+        prompt_length_min: int = int(prompt_source_split[1])
+        prompt_length_max: int = int(prompt_source_split[2])
+        logger.info("Generating random prompts...")
+        prompt_n = get_prompt_lengths_rng(n_prompts, prompt_length_min, prompt_length_max)
+        prompts = get_prompts_rng(prompt_n)
+    else:
+        n_predict_min = n_predict
+
+    if os.environ.get("LLAMA_ARG_CTX_SIZE") is None:
+        context_per_slot: int = int(1.05 * (n_predict + (np.max(prompt_n) if synthetic_prompts else 2048)))
+        context_total: int = context_per_slot * parallel
+        os.environ["LLAMA_ARG_CTX_SIZE"] = str(context_total)
+        logger.info(f"LLAMA_ARG_CTX_SIZE not explicitly set, using {context_total} ({context_per_slot} per slot).")

    server: Optional[dict] = None
    session = None
    try:
-        server = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
+        server = get_server(path_server, path_log)
        server_address: str = server["address"]

-        adapter = requests.adapters.HTTPAdapter(pool_connections=num_workers, pool_maxsize=num_workers)  # type: ignore
+        adapter = requests.adapters.HTTPAdapter(pool_connections=parallel, pool_maxsize=parallel)  # type: ignore
        session = requests.Session()
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        data: list[dict] = []
-        for i, p in enumerate(prompts):
-            data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})

-        logger.info("Getting the prompt lengths...")
-        prompt_n = [get_prompt_length(d) for d in data]
+        for i, p in enumerate(prompts):
+            random.seed(13 * i + 1)
+            data.append({
+                "session": session, "server_address": server_address, "prompt": p, "synthetic_prompt": synthetic_prompts,
+                "n_predict": random.randint(n_predict_min, n_predict), "seed": 13 * i + 2})
+
+        if not synthetic_prompts:
+            logger.info("Getting the prompt lengths...")
+            prompt_n = [get_prompt_length(d) for d in data]

        logger.info("Starting the benchmark...\n")
        t0 = time()
-        results: list[tuple[int, list[float]]] = thread_map(send_prompt, data, max_workers=num_workers, chunksize=1)
+        results: list[tuple[float, list[float]]] = thread_map(send_prompt, data, max_workers=parallel, chunksize=1)
    finally:
        if server is not None:
            server["process"].terminate()
@@ -146,17 +195,18 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
        if session is not None:
            session.close()

-    prompt_ms = []
+    prompt_t = []
    token_t = []
    depth_sum: int = 0
-    for pn, (pms, tat) in zip(prompt_n, results):
-        prompt_ms.append(pms)
+    for pn, (t_submit, tat) in zip(prompt_n, results):
+        prompt_t.append(tat[0] - t_submit)
        token_t += tat
        n_tokens: int = len(tat)
        depth_sum += n_tokens * pn
        depth_sum += n_tokens * (n_tokens + 1) // 2
+    assert len(token_t) > 0
    prompt_n = np.array(prompt_n, dtype=np.int64)
-    prompt_ms = np.array(prompt_ms, dtype=np.float64)
+    prompt_t = np.array(prompt_t, dtype=np.float64)
    token_t = np.array(token_t, dtype=np.float64)

    token_t -= t0
@@ -167,18 +217,21 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
    logger.info(f"Request throughput:                {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last/60):.2f} requests/min")
    logger.info(f"Total prompt length:               {np.sum(prompt_n)} tokens")
    logger.info(f"Average prompt length:             {np.mean(prompt_n):.2f} tokens")
-    logger.info(f"Average prompt latency:            {np.mean(prompt_ms):.2f} ms")
-    logger.info(f"Average prompt speed:              {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
+    logger.info(f"Average prompt latency:            {1e3 * np.mean(prompt_t):.2f} ms")
+    logger.info(f"Average prompt speed:              {np.sum(prompt_n) / np.sum(prompt_t):.2f} tokens/s")
    logger.info(f"Total generated tokens:            {token_t.shape[0]}")
    logger.info(f"Average generation depth:          {depth_sum / token_t.shape[0]:.2f} tokens")
    logger.info(f"Average total generation speed:    {token_t.shape[0] / token_t_last:.2f} tokens/s")
    logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")
+    logger.info("")
+    logger.info(
+        "The above numbers are the speeds as observed by the Python script and may differ from the performance reported by the server, "
+        "particularly when the server is fast vs. the network or Python script (e.g. when serving a very small model).")

    plt.figure()
-    plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)
-    plt.xlim(0, 1.05 * np.max(prompt_n))
-    plt.ylim(0, 1.05 * np.max(prompt_ms))
-    plt.title(path_model)
+    plt.scatter(prompt_n, 1e3 * prompt_t, s=10.0, marker=".", alpha=0.25)
+    plt.xlim(0, 1.05e0 * np.max(prompt_n))
+    plt.ylim(0, 1.05e3 * np.max(prompt_t))
    plt.xlabel("Prompt length [tokens]")
    plt.ylabel("Time to first token [ms]")
    plt.savefig("prompt_time.png", dpi=240)
@@ -187,7 +240,6 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
    plt.figure()
    plt.hist(token_t, np.arange(0, bin_max))
    plt.xlim(0, bin_max + 1)
-    plt.title(path_model)
    plt.xlabel("Time [s]")
    plt.ylabel("Num. tokens generated per second")
    plt.savefig("gen_rate.png", dpi=240)
@@ -196,15 +248,18 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Tool for benchmarking the throughput of the llama.cpp HTTP server. "
-        "Results are printed to console and visualized as plots (saved to current working directory).")
+        "Results are printed to console and visualized as plots (saved to current working directory). "
+        "To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help).")
    parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
-    parser.add_argument("--path_model", type=str, required=True, help="Path to the model to use for the benchmark")
-    parser.add_argument("--path_log", type=str, default=None, help="Path to the model to use for the benchmark")
-    parser.add_argument("--port", type=int, default=18725, help="Port to use for the server during the benchmark")
-    parser.add_argument("--n_gpu_layers", type=int, default=999, help="Number of GPU layers for the server")
-    parser.add_argument("--parallel", type=int, default=16, help="Number of slots for the server")
-    parser.add_argument("--ctx_size", type=int, default=4096, help="Server context size per slot")
-    parser.add_argument("--n_prompts", type=int, default=1000, help="Number of prompts to evaluate")
+    parser.add_argument("--path_log", type=str, default="server-bench.log", help="Path to the model to use for the benchmark")
+    parser.add_argument(
+        "--prompt_source", type=str, default="rng-1024-2048",
+        help="How to get the prompts for the benchmark, either 'mmlu' for MMLU questions or "
+        "rng-MIN-MAX for synthetic prompts with random lengths in the interval [MIN, MAX]")
+    parser.add_argument("--n_prompts", type=int, default=100, help="Number of prompts to evaluate")
    parser.add_argument("--n_predict", type=int, default=2048, help="Max. number of tokens to predict per prompt")
+    parser.add_argument(
+        "--n_predict_min", type=int, default=1024,
+        help="Min. number of tokens to predict per prompt (supported for synthetic prompts only)")
    args = parser.parse_args()
    benchmark(**vars(args))
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
    { LLM_ARCH_SMOLLM3,          "smollm3"          },
    { LLM_ARCH_LFM2,             "lfm2"             },
+    { LLM_ARCH_DREAM,            "dream"            },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@@ -1891,6 +1892,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
        },
    },
+    {
+        LLM_ARCH_DREAM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
 };

 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -2133,3 +2151,12 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
            return false;
    }
 }
+
+// returns true for the architectures that are diffusion models - at the moment only LLM_ARCH_DREAM
+bool llm_arch_is_diffusion(const llm_arch & arch) {
+    if (arch == LLM_ARCH_DREAM) {
+        return true;
+    }
+
+    return false;
+}
@@ -89,6 +89,7 @@ enum llm_arch {
    LLM_ARCH_HUNYUAN_MOE,
    LLM_ARCH_SMOLLM3,
    LLM_ARCH_LFM2,
+    LLM_ARCH_DREAM,
    LLM_ARCH_UNKNOWN,
 };

@@ -479,3 +480,4 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);

 bool llm_arch_is_recurrent(const llm_arch & arch);
 bool llm_arch_is_hybrid   (const llm_arch & arch);
+bool llm_arch_is_diffusion(const llm_arch & arch);
@@ -27,6 +27,7 @@ bool llama_batch_allocr::init(
        const llama_vocab & vocab,
        const llama_memory_i * memory,
        uint32_t n_embd,
+        uint32_t n_seq_max,
        bool output_all) {
    clear();

@@ -40,6 +41,11 @@ bool llama_batch_allocr::init(
    // validate input batch
    //

+    if (n_seq_max > LLAMA_MAX_SEQ) {
+        LLAMA_LOG_ERROR("%s: n_seq_max = %d > %d\n", __func__, n_seq_max, LLAMA_MAX_SEQ);
+        return false;
+    }
+
    if (batch.token) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
@@ -52,8 +58,8 @@ bool llama_batch_allocr::init(
    if (batch.seq_id) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
-                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
+                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
+                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                    return false;
                }
            }
@@ -86,7 +92,7 @@ bool llama_batch_allocr::init(

        // initialize the starting position for each sequence based on the positions in the memory
        llama_pos p0[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
            if (!memory) {
                // if no memory -> start from 0
                p0[s] = 0;
@@ -143,7 +149,8 @@ bool llama_batch_allocr::init(
    // compute stats
    //

-    this->n_embd = n_embd;
+    this->n_embd    = n_embd;
+    this->n_seq_max = n_seq_max;

    // count the outputs in this batch
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
@@ -189,7 +196,7 @@ bool llama_batch_allocr::init(
            seq_set_map[cur].push_back(i);
        }

-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
            if (seq_set_unq.test(s)) {
                seq_idx[s] = seq_id_unq.size();
                seq_id_unq.push_back(s);
@@ -241,7 +248,7 @@ bool llama_batch_allocr::init(
    // consistency checks
    //

-    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_pos[s].empty()) {
            continue;
        }
@@ -284,8 +291,8 @@ bool llama_batch_allocr::init(
    }

    if (memory) {
-        for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
-            for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
+        for (uint32_t s0 = 0; s0 < n_seq_max; ++s0) {
+            for (uint32_t s1 = 0; s1 < n_seq_max; ++s1) {
                if (seq_cpl[s0][s1]) {
                    if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
                        memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
@@ -316,12 +323,12 @@ bool llama_batch_allocr::init(
    //
    {
        seq_set_t cur_seq_set[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
            cur_seq_set[s].set();
        }

        llama_pos cur_seq_pos[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
            cur_seq_pos[s] = -1;
        }

@@ -692,7 +699,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }
    }

-    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            ubatch.seq_idx[s] = ubatch.seq_id_unq.size();
            ubatch.seq_id_unq.push_back(s);
@@ -48,6 +48,7 @@ public:
            const llama_vocab & vocab,
            const llama_memory_i * memory,
            uint32_t n_embd,
+            uint32_t n_seq_max,
            bool output_all);

    const llama_batch & get_batch() const;
@@ -100,6 +101,7 @@ private:
    const uint32_t n_pos_per_embd;

    uint32_t n_embd;
+    uint32_t n_seq_max;
    uint32_t n_outputs;

    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
@@ -65,6 +65,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
    { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
+    { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -188,6 +189,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_DOTS1;
    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
+        return LLM_CHAT_TEMPLATE_KIMI_K2;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -680,6 +683,26 @@ int32_t llm_chat_apply_template(
                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
            }
        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
+        // moonshotai/Kimi-K2-Instruct
+        // every message is rendered as: <role header> content <|im_end|>
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|im_system|>system<|im_middle|>";
+            } else if (role == "user") {
+                ss << "<|im_user|>user<|im_middle|>";
+            } else if (role == "assistant") {
+                ss << "<|im_assistant|>assistant<|im_middle|>";
+            } else if (role == "tool") {
+                ss << "<|im_system|>tool<|im_middle|>";
+            }
+
+            ss << message->content << "<|im_end|>";
+        }
+
+        // the assistant header (generation prompt) is appended once, after the last message,
+        // and not after each individual message
+        if (add_ass) {
+            ss << "<|im_assistant|>assistant<|im_middle|>";
+        }
    } else {
        // template not supported
        return -1;
@@ -45,6 +45,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_DOTS1,
    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

@@ -98,10 +98,20 @@ llama_context::llama_context(
        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
        cparams.n_batch = GGML_KQ_MASK_PAD;
    }
-
    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

    cparams.op_offload = params.op_offload;
+    cparams.kv_unified = params.kv_unified;
+
+    {
+        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
+        const bool supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
+
+        if (!supports_set_rows && !cparams.kv_unified) {
+            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
+            cparams.kv_unified = true;
+        }
+    }

    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

@@ -112,6 +122,7 @@ llama_context::llama_context(
    LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
    LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: kv_unified    = %s\n",   __func__, cparams.kv_unified ? "true" : "false");
    LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
    LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);

@@ -267,7 +278,7 @@ llama_context::llama_context(

    // reserve worst-case graph
    if (!hparams.vocab_only && memory) {
-        const uint32_t n_seqs = cparams.n_seq_max;
+        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
@@ -300,7 +311,7 @@ llama_context::llama_context(

        // reserve with tg graph to get the number of splits and nodes
        {
-            auto * gf = graph_reserve(1, 1, 1, mctx.get());
+            auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
            if (!gf) {
                throw std::runtime_error("failed to allocate compute tg buffers");
            }
@@ -311,6 +322,10 @@ llama_context::llama_context(

        // reserve again with pp graph to avoid ggml-alloc reallocations during inference
        {
+            // TODO: not sure if the following graph would be a worse case for multi-stream KV caches:
+            //
+            // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
+            //
            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
            if (!gf) {
                throw std::runtime_error("failed to allocate compute pp buffers");
@@ -475,7 +490,7 @@ bool llama_context::kv_self_update(bool optimize) {
            throw std::runtime_error("failed to initialize memory context");
        }

-        const uint32_t n_seqs   = cparams.n_seq_max;
+        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -735,13 +750,15 @@ int llama_context::encode(const llama_batch & batch_inp) {
    const int32_t n_vocab = model.vocab.n_tokens();

    // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
+    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }

    const uint32_t n_tokens = balloc->get_n_tokens();

+    // [TAG_NO_CACHE_PAD]
+    // TODO: add new split mode where we pad the input sequences so that ubatch.equal_seqs == true
    const llama_ubatch ubatch = balloc->split_simple(n_tokens);

    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
@@ -910,7 +927,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
    // when computing embeddings, all tokens are output
    const bool output_all = cparams.embeddings;

-    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }
@@ -2039,7 +2056,7 @@ void llama_context::opt_epoch_iter(
            batch.logits  [pos_batch]    = true;
        }

-        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
+        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
            LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
            return;
        }
@@ -2198,6 +2215,7 @@ llama_context_params llama_context_default_params() {
        /*.no_perf                     =*/ true,
        /*.op_offload                  =*/ true,
        /*.swa_full                    =*/ true,
+        /*.kv_unified                  =*/ false,
    };

    return result;
@@ -11,8 +11,8 @@ struct llama_cparams {
    uint32_t n_batch;
    uint32_t n_ubatch;
    uint32_t n_seq_max;
-    int      n_threads;       // number of threads to use for generation
-    int      n_threads_batch; // number of threads to use for batch processing
+    int32_t  n_threads;       // number of threads to use for generation
+    int32_t  n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;
    float rope_freq_scale;
@@ -33,6 +33,7 @@ struct llama_cparams {
    bool no_perf;
    bool warmup;
    bool op_offload;
+    bool kv_unified;

    enum llama_pooling_type pooling_type;

@@ -982,13 +982,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             float     kq_scale) const {
    const bool v_trans = v->nb[1] > v->nb[2];

+    // split the batch into streams if needed
+    const auto n_stream = k->ne[3];
+
+    q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
+
    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
    v = ggml_permute(ctx0, v, 0, 2, 1, 3);

-    const auto n_tokens = q->ne[1];
-    const auto n_head   = q->ne[2];
-    const auto n_kv     = k->ne[1];
+    const auto n_kv = k->ne[1];

    ggml_tensor * cur;

@@ -1030,7 +1033,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 #endif
        }

-        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
    } else {
        ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);

@@ -1075,7 +1078,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(

        cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);

-        cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+        // recombine streams
+        cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);

        if (!cparams.offload_kqv) {
            // all nodes between the KV store and the attention output are run on the CPU
@@ -1122,6 +1126,10 @@ ggml_tensor * llm_graph_context::build_attn(

    const auto & kq_mask = inp->get_kq_mask();

+    // [TAG_NO_CACHE_PAD]
+    // TODO: if ubatch.equal_seqs == true, we can split the three tensors below into ubatch.n_seqs_unq streams
+    assert(ubatch.equal_seqs == false);
+
    ggml_tensor * q = q_cur;
    ggml_tensor * k = k_cur;
    ggml_tensor * v = v_cur;
@@ -1156,13 +1164,14 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
    {
        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");

-        const auto n_kv = mctx_cur->get_n_kv();
+        const auto n_kv     = mctx_cur->get_n_kv();
        const auto n_tokens = ubatch.n_tokens;
+        const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;

        inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
        ggml_set_input(inp->self_kq_mask);

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1362,13 +1371,15 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif

    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, mctx_cur);

+    const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
    {
        const auto n_kv = mctx_cur->get_base()->get_n_kv();

        inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
        ggml_set_input(inp->self_kq_mask);

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1382,7 +1393,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
        inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
        ggml_set_input(inp->self_kq_mask_swa);

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -255,10 +255,10 @@ public:
    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }

    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]

-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

    const llama_hparams & hparams;
    const llama_cparams & cparams;
@@ -289,14 +289,14 @@ public:
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

    ggml_tensor * self_k_idxs     = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs     = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs     = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
    ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]

-    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

    const llama_hparams & hparams;
    const llama_cparams & cparams;
@@ -65,6 +65,46 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
    return n_embd_head_v * n_head_kv;
 }

+// returns true if at least one layer reports an n_embd_k_gqa that differs from the value of the argument-less call
+bool llama_hparams::is_n_embd_k_gqa_variable() const {
+    const uint32_t n_ref = n_embd_k_gqa();
+
+    uint32_t il = 0;
+    while (il < n_layer) {
+        if (n_embd_k_gqa(il) != n_ref) {
+            return true;
+        }
+        ++il;
+    }
+
+    return false;
+}
+
+// returns true if at least one layer reports an n_embd_v_gqa that differs from the value of the argument-less call
+bool llama_hparams::is_n_embd_v_gqa_variable() const {
+    const uint32_t n_ref = n_embd_v_gqa();
+
+    uint32_t il = 0;
+    while (il < n_layer) {
+        if (n_embd_v_gqa(il) != n_ref) {
+            return true;
+        }
+        ++il;
+    }
+
+    return false;
+}
+
+// returns the largest n_embd_k_gqa over all layers (never less than the value of the argument-less call)
+uint32_t llama_hparams::n_embd_k_gqa_max() const {
+    uint32_t n_max = n_embd_k_gqa();
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_cur = n_embd_k_gqa(il);
+        if (n_cur > n_max) {
+            n_max = n_cur;
+        }
+    }
+
+    return n_max;
+}
+
+// returns the largest n_embd_v_gqa over all layers (never less than the value of the argument-less call)
+uint32_t llama_hparams::n_embd_v_gqa_max() const {
+    uint32_t n_max = n_embd_v_gqa();
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_cur = n_embd_v_gqa(il);
+        if (n_cur > n_max) {
+            n_max = n_cur;
+        }
+    }
+
+    return n_max;
+}
+
 uint32_t llama_hparams::n_embd_r() const {
    if (wkv_head_size != 0) {
        // for RWKV models
@@ -6,7 +6,7 @@

 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 256  // DeepSeekV3
+#define LLAMA_MAX_EXPERTS 384  // Kimi-K2

 enum llama_expert_gating_func_type {
    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
@@ -191,6 +191,14 @@ struct llama_hparams {
    // dimension of value embeddings across all k-v heads
    uint32_t n_embd_v_gqa(uint32_t il = 0) const;

+    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+    bool is_n_embd_k_gqa_variable() const;
+    bool is_n_embd_v_gqa_variable() const;
+
+    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+    uint32_t n_embd_k_gqa_max() const;
+    uint32_t n_embd_v_gqa_max() const;
+
    // dimension of the rolling state embeddings
    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
    uint32_t n_embd_r() const;
@@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
                     bool   v_trans,
                     bool   offload,
                     bool   swa_full,
+                     bool   unified,
                 uint32_t   kv_size,
                 uint32_t   n_seq_max,
                 uint32_t   n_ubatch,
-                 uint32_t   n_pad) : hparams(model.hparams) {
+                 uint32_t   n_pad) : hparams(model.hparams), unified(unified) {
    llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
    llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };

    const uint32_t size_base = kv_size;

-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));

    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
    if (swa_full) {
@@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(

    kv_base = std::make_unique<llama_kv_cache_unified>(
            model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, n_seq_max, n_pad,
+            v_trans, offload, unified, size_base, n_seq_max, n_pad,
            0, LLAMA_SWA_TYPE_NONE);

    LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);

    kv_swa = std::make_unique<llama_kv_cache_unified>(
            model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, n_seq_max, n_pad,
+            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
            hparams.n_swa, hparams.swa_type);
 }

@@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

    // first try simple split
    do {
+        if (!unified) {
+            // requires equal splits, so we skip the simple split
+            break;
+        }
+
        balloc.split_reset();

        std::vector<llama_ubatch> ubatches;
@@ -140,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

        std::vector<llama_ubatch> ubatches;
        while (true) {
-            auto ubatch = balloc.split_equal(n_ubatch, false);
+            auto ubatch = balloc.split_equal(n_ubatch, !unified);

            if (ubatch.n_tokens == 0) {
                break;
@@ -20,6 +20,7 @@ public:
                         bool   v_trans,
                         bool   offload,
                         bool   swa_full,
+                         bool   unified,
                     uint32_t   kv_size,
                     uint32_t   n_seq_max,
                     uint32_t   n_ubatch,
@@ -68,6 +69,8 @@ public:
 private:
    const llama_hparams & hparams;

+    const bool unified;
+
    std::unique_ptr<llama_kv_cache_unified> kv_base;
    std::unique_ptr<llama_kv_cache_unified> kv_swa;
 };
@@ -35,16 +35,50 @@ public:
        std::vector<uint32_t> ids;
    };

+    struct stream_copy_info { // two parallel lists describing stream copies (presumably ssrc[i] -> sdst[i]; confirm against update())
+        bool empty() const {
+            assert(ssrc.size() == sdst.size()); // the two lists are expected to always have the same length
+            return ssrc.empty();
+        }
+
+        std::vector<uint32_t> ssrc;
+        std::vector<uint32_t> sdst;
+    };
+
    // for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the
    //   KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]]
    struct slot_info {
        // data for ggml_set_rows
        using idx_vec_t = std::vector<uint32_t>;

-        idx_vec_t idxs;
+        // number of streams: ns = s1 - s0 + 1
+        llama_seq_id s0;
+        llama_seq_id s1;
+
+        std::vector<llama_seq_id> strm; // [ns]
+        std::vector<idx_vec_t>    idxs; // [ns]

        uint32_t head() const {
-            return idxs.at(0);
+            GGML_ASSERT(idxs.size() == 1);
+            GGML_ASSERT(!idxs[0].empty());
+
+            return idxs[0][0];
+        }
+
+        void resize(size_t n) {
+            strm.resize(n);
+            idxs.resize(n);
+        }
+
+        size_t size() const {
+            GGML_ASSERT(idxs.size() == strm.size());
+            GGML_ASSERT(!idxs.empty());
+
+            return idxs[0].size();
+        }
+
+        size_t n_stream() const {
+            return strm.size();
        }

        bool empty() const {
@@ -54,9 +88,6 @@ public:
        void clear() {
            idxs.clear();
        }
-
-        // TODO: implement
-        //std::vector<idx_vec_t> seq_idxs;
    };

    using slot_info_vec_t = std::vector<slot_info>;
@@ -68,6 +99,7 @@ public:
                    ggml_type    type_v,
                         bool    v_trans,
                         bool    offload,
+                         bool    unified,
                     uint32_t    kv_size,
                     uint32_t    n_seq_max,
                     uint32_t    n_pad,
@@ -111,7 +143,8 @@ public:
    // llama_kv_cache_unified specific API
    //

-    uint32_t get_size() const;
+    uint32_t get_size()     const;
+    uint32_t get_n_stream() const;

    bool get_has_shift() const;

@@ -122,8 +155,8 @@ public:
    uint32_t get_n_kv() const;

    // get views of the current state of the cache
-    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
-    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;

    // store k_cur and v_cur in the cache based on the provided head location
    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
@@ -137,7 +170,7 @@ public:
    // return empty vector on failure
    slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);

-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
+    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);

    // find a slot of kv cells that can hold the ubatch
    // if cont == true, then the slot must be continuous
@@ -157,8 +190,9 @@ public:
    void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
    void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;

+    void set_input_k_shift(ggml_tensor * dst) const;
+
    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
-    void set_input_k_shift   (ggml_tensor * dst) const;
    void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;

 private:
@@ -172,15 +206,15 @@ private:

        ggml_tensor * k;
        ggml_tensor * v;
+
+        std::vector<ggml_tensor *> k_stream;
+        std::vector<ggml_tensor *> v_stream;
    };

    bool v_trans = true;  // the value tensor is transposed

-    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
-    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
-    uint32_t head = 0;
-
    const uint32_t n_seq_max = 1;
+    const uint32_t n_stream  = 1;

    // required padding
    const uint32_t n_pad = 1;
@@ -200,7 +234,17 @@ private:
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

-    llama_kv_cells_unified cells;
+    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
+    std::vector<uint32_t> v_heads;
+
+    std::vector<llama_kv_cells_unified> v_cells;
+
+    // maps from a sequence id to a stream id
+    std::vector<uint32_t> seq_to_stream;
+
+    // pending stream copies that will be applied during the next update
+    stream_copy_info sc_info;

    std::vector<kv_layer> layers;

@@ -237,18 +281,25 @@ private:
                    ggml_cgraph * gf,
              const defrag_info & dinfo) const;

-    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+    struct cell_ranges_t {
+        uint32_t strm;

-    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+        std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
+    };
+
+    void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };

 class llama_kv_cache_unified_context : public llama_memory_context_i {
 public:
    // some shorthands
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
-    using defrag_info     = llama_kv_cache_unified::defrag_info;
+    using slot_info_vec_t  = llama_kv_cache_unified::slot_info_vec_t;
+    using defrag_info      = llama_kv_cache_unified::defrag_info;
+    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;

    // used for errors
    llama_kv_cache_unified_context(llama_memory_status status);
@@ -262,7 +313,8 @@ public:
            llama_kv_cache_unified * kv,
            llama_context * lctx,
            bool do_shift,
-            defrag_info dinfo);
+            defrag_info dinfo,
+            stream_copy_info sc_info);

    // used to create a batch procesing context from a batch
    llama_kv_cache_unified_context(
@@ -320,6 +372,8 @@ private:

    defrag_info dinfo;

+    stream_copy_info sc_info;
+
    //
    // batch processing context
    //
@@ -40,6 +40,7 @@ llama_memory_hybrid::llama_memory_hybrid(
        offload,
+        1, // unified: must sit between `offload` and `kv_size` to match the llama_kv_cache_unified constructor order
        kv_size,
        n_seq_max,
        n_pad,
        n_swa,
        swa_type
@@ -849,6 +849,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_DREAM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // Dream models are primarily 7B with 28 layers
+                switch (hparams.n_layer) {
+                    case 28:
+                        type = LLM_TYPE_7B;
+                        break;
+                    default:
+                        type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            }
+            break;
        case LLM_ARCH_QWEN2MOE:
            {
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
@@ -2670,6 +2685,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_QWEN2:
            case LLM_ARCH_QWEN2VL:
+            case LLM_ARCH_DREAM:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -7756,6 +7772,109 @@ struct llm_build_qwen2 : public llm_graph_context {
    }
 };

+struct llm_build_dream : public llm_graph_context { // compute-graph builder selected for LLM_ARCH_DREAM
+    llm_build_dream(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) :
+        llm_graph_context(params) {
+        // layer layout copied from qwen2; attention input is built with build_attn_inp_no_cache() below
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q, K and V (each projection followed by its bias); RoPE is applied to Q and K only
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                Qcur               = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                Kcur               = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                Vcur               = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
+                                 nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows selected by inp_out_ids
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // add the attention output back onto the layer input
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
+                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp); // add the FFN output back onto its input
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur; // normalized hidden state, taken before the lm_head projection
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_qwen2vl : public llm_graph_context {
    llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -15763,6 +15882,7 @@ private:
        cb(zx, "mamba_in_proj", il);
        // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
+        zx = ggml_cont(ctx0, zx);
        zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
        cb(zx, "mamba_in_proj_out", il);

@@ -15780,7 +15900,6 @@ private:
        // conv1d
        {
            // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
-            x = ggml_view_2d(ctx0, x, d_inner, n_seq_tokens * n_seqs, d_inner * x->nb[0], 0);
            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
            cb(conv_x, "mamba_conv1d_input", il);

@@ -16487,6 +16606,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_DREAM:
            {
                res = nullptr;
            } break;
@@ -16527,7 +16647,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                } else {
                    const auto padding = llama_kv_cache_unified::get_padding(cparams);

-                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+                    uint32_t n_ctx_per_stream = cparams.n_ctx;
+
+                    if (!cparams.kv_unified) {
+                        n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
+                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
+
+                        cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
+                    } else {
+                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
+
+                        cparams.n_ctx = n_ctx_per_stream;
+                    }

                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

@@ -16541,7 +16672,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
-                                cparams.n_ctx,
+                                cparams.kv_unified,
+                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                padding);
@@ -16555,7 +16687,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
-                                cparams.n_ctx,
+                                cparams.kv_unified,
+                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                padding,
                                hparams.n_swa,
@@ -16638,6 +16771,11 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_qwen2>(*this, params, gf);
            } break;
+        case LLM_ARCH_DREAM:
+            {
+                llm = std::make_unique<llm_build_dream>(*this, params, gf);
+            }
+            break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params, gf);
@@ -17055,6 +17193,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
@@ -405,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+                regex_exprs = {
+                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
+                    // The custom handler implements all K2 patterns with proper Han character exclusion
+                    "\\p{Han}+",
+                };
+                break;
            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                regex_exprs = {
                    "\\p{N}+",
@@ -1954,6 +1961,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "hunyuan") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "kimi-k2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+                clean_spaces = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -3343,6 +3354,10 @@ llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
 }

+llama_token llama_vocab::token_mask() const { // returns the special mask token id stored in the vocab implementation
+    return pimpl->special_mask_id;
+}
+
 bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
 }
@@ -3583,6 +3598,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
 }

+llama_token llama_vocab_mask(const struct llama_vocab* vocab) { // free-function wrapper that forwards to vocab->token_mask()
+    return vocab->token_mask();
+}
+
 // deprecated
 const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);
@@ -45,6 +45,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
 };

 struct LLM_KV;
@@ -100,6 +101,7 @@ struct llama_vocab {
    llama_token token_sep() const;
    llama_token token_nl () const;
    llama_token token_pad() const;
+    llama_token token_mask() const;

    llama_token token_prefix() const;
    llama_token token_middle() const;
@@ -557,6 +557,178 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
    return bpe_offsets;
 }

+// Hand-written splitter used in place of the Kimi K2 regex. K2 system regex patterns (from tokenization_kimi.py):
+// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
+static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets; // output: length (in code points) of every piece, in order
+    bpe_offsets.reserve(offsets.size());
+
+    const auto cpts = unicode_cpts_from_utf8(text); // every position used below indexes into this code point array
+
+    size_t start = 0;
+    for (auto offset : offsets) { // each offset is the length, in code points, of one input chunk
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; // sentinel returned by _get_cpt outside [offset_ini, offset_end)
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
+        };
+
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags { // default-constructed flags outside the chunk
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t { // records the piece [_prev_end, end) if non-empty; returns its length
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const uint32_t cpt = _get_cpt(pos);
+            const auto flags = _get_flags(pos);
+
+            // Pattern 1: [\p{Han}]+ (Chinese characters)
+            if (unicode_cpt_is_han(cpt)) {
+                while (unicode_cpt_is_han(_get_cpt(pos))) {
+                    pos++;
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            // Pattern 2 & 3: Letter words excluding Han characters with optional contractions. NOTE(review): only is_letter is tested below; the Lu/Lt vs Ll split and \p{M} of the reference regex are not distinguished
+            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
+            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
+            // Check if current char is a letter OR if current char could be a leading char and next char is a letter
+            bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
+                                     (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
+                                      _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));
+
+            if (is_letter_pattern) {
+                // Handle optional leading non-letter/non-number character
+                bool has_leading_char = false;
+                if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
+                    has_leading_char = true;
+                    pos++;
+                }
+
+                // Match letter sequence (excluding Han characters)
+                bool has_letters = false;
+                while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
+                    has_letters = true;
+                    pos++;
+                }
+
+                // Only proceed if we found letters (after potentially skipping leading char). NOTE(review): the loop above already consumed every matching letter, so the second operand below, the "consume the first letter" step and the second letter loop look redundant
+                if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
+                    if (!has_letters) pos++; // consume the first letter if we didn't already
+
+                    // Continue consuming letters
+                    while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
+                        pos++;
+                    }
+
+                    // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d), compared case-insensitively via unicode_tolower
+                    if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
+                        uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
+                        if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                            pos += 2;
+                        } else if (pos + 2 < offset_end) {
+                            uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
+                            if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                                (cpt_next == 'v' && cpt_next_next == 'e') ||
+                                (cpt_next == 'l' && cpt_next_next == 'l')) {
+                                pos += 3;
+                            }
+                        }
+                    }
+
+                    _add_token(pos);
+                    continue;
+                } else if (has_leading_char) {
+                    // We consumed a leading char but found no letters, backtrack
+                    pos--;
+                }
+            }
+
+            // Pattern 4: \p{N}{1,3} (numbers 1-3 digits); a longer digit run is emitted as consecutive pieces of up to 3
+            if (flags.is_number) {
+                size_t ini = pos;
+                while (_get_flags(pos).is_number) {
+                    if (++pos - ini >= 3) {
+                        _add_token(pos);
+                        ini = pos;
+                    }
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            // Pattern 5:  ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
+            auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
+            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
+                pos += (cpt == ' ');
+                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
+                    flags2 = _get_flags(++pos);
+                }
+                // Match optional [\r\n]*
+                uint32_t cpt2 = _get_cpt(pos);
+                while (cpt2 == '\r' || cpt2 == '\n') {
+                    cpt2 = _get_cpt(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            // Count the whitespace run starting at pos and remember where its last \r or \n ends
+            size_t num_whitespaces = 0;
+            size_t last_end_r_or_n = 0;
+            while (_get_flags(pos + num_whitespaces).is_whitespace) {
+                uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
+                if (cpt2 == '\r' || cpt2 == '\n') {
+                    last_end_r_or_n = pos + num_whitespaces + 1;
+                }
+                num_whitespaces++;
+            }
+
+            // Pattern 6: \s*[\r\n]+ (whitespace with newlines)
+            if (last_end_r_or_n > 0) {
+                pos = last_end_r_or_n;
+                _add_token(pos);
+                continue;
+            }
+
+            // Pattern 7: \s+(?!\S) (run followed by more input in this chunk: emit all but its last whitespace)
+            if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // Pattern 8: \s+ (general whitespace)
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // No matches - consume single character
+            _add_token(++pos);
+        }
+    }
+
+    return bpe_offsets;
+}
+
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

@@ -567,6 +739,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {

        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+    } else if (regex_expr == "\\p{Han}+") {
+        // K2's first pattern - handle all K2 patterns together
+        bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
    }

    return bpe_offsets;
@@ -672,6 +847,38 @@ uint32_t unicode_tolower(uint32_t cpt) {
    return cpt;  // Return the original code point if no lowercase mapping is found
 }

+bool unicode_cpt_is_han(uint32_t cpt) { // true iff cpt lies inside one of the CJK ideograph ranges tested below
+    // Han character ranges (Chinese/CJK characters), checked one block at a time
+    // CJK Unified Ideographs (most common): U+4E00..U+9FFF
+    if (cpt >= 0x4E00 && cpt <= 0x9FFF) return true;
+
+    // CJK Extension A: U+3400..U+4DBF
+    if (cpt >= 0x3400 && cpt <= 0x4DBF) return true;
+
+    // CJK Extension B: U+20000..U+2A6DF
+    if (cpt >= 0x20000 && cpt <= 0x2A6DF) return true;
+
+    // CJK Extension C: U+2A700..U+2B73F
+    if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true;
+
+    // CJK Extension D: U+2B740..U+2B81F
+    if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true;
+
+    // CJK Extension E: U+2B820..U+2CEAF
+    if (cpt >= 0x2B820 && cpt <= 0x2CEAF) return true;
+
+    // CJK Extension F: U+2CEB0..U+2EBEF
+    if (cpt >= 0x2CEB0 && cpt <= 0x2EBEF) return true;
+
+    // CJK Compatibility Ideographs: U+F900..U+FAFF
+    if (cpt >= 0xF900 && cpt <= 0xFAFF) return true;
+
+    // CJK Compatibility Ideographs Supplement: U+2F800..U+2FA1F
+    if (cpt >= 0x2F800 && cpt <= 0x2FA1F) return true;
+
+    return false; // NOTE(review): later additions such as CJK Extension G (U+30000 and up) are not listed above -- confirm whether the reference tokenizer's \p{Han} needs them
+}
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {
@@ -63,4 +63,6 @@ uint8_t     unicode_utf8_to_byte(const std::string & utf8);

 uint32_t unicode_tolower(uint32_t cpt);

+bool unicode_cpt_is_han(uint32_t cpt);
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
@@ -4282,7 +4282,7 @@ struct test_flash_attn_ext : public test_case {

        ggml_tensor * m = nullptr;
        if (mask) {
-            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), nr23[0], nr23[1]);
+            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, nr23[1]);
            ggml_set_name(m, "m");
        }

@@ -127,10 +127,9 @@ int main(int argc, char ** argv) {

                for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
                    for (int i = 0; i < pp; ++i) {
-                        common_batch_add(batch, 0, i, { j }, false);
+                        common_batch_add(batch, 0, i, { j }, i == pp - 1);
                    }
                }
-                batch.logits[batch.n_tokens - 1] = true;

                const auto t_pp_start = ggml_time_us();

@@ -7,7 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 **Features:**
 * LLM inference of F16 and quantized models on GPU and CPU
 * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
- * Reranking endoint (https://github.com/ggml-org/llama.cpp/pull/9510)
+ * Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510)
 * Parallel decoding with multi-user support
 * Continuous batching
 * Multimodal ([documentation](../../docs/multimodal.md)) / with OpenAI-compatible API support
@@ -127,7 +127,6 @@ struct slot_params {
    std::vector<std::string> response_fields;
    bool timings_per_token = false;
    bool post_sampling_probs = false;
-    bool ignore_eos = false;

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
@@ -441,7 +440,6 @@ struct server_task {

        {
            params.sampling.logit_bias.clear();
-            params.ignore_eos = json_value(data, "ignore_eos", false);

            const auto & logit_bias = data.find("logit_bias");
            if (logit_bias != data.end() && logit_bias->is_array()) {
@@ -472,6 +470,13 @@ struct server_task {
                    }
                }
            }
+
+            params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
+            if (params.sampling.ignore_eos) {
+                params.sampling.logit_bias.insert(
+                        params.sampling.logit_bias.end(),
+                        defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
+            }
        }

        {
@@ -1898,7 +1903,6 @@ struct server_context {

    bool clean_kv_cache = true;
    bool add_bos_token  = true;
-    bool has_eos_token  = false;

    int32_t n_ctx; // total context for all clients / slots

@@ -1957,7 +1961,6 @@ struct server_context {
        n_ctx = llama_n_ctx(ctx);

        add_bos_token = llama_vocab_get_add_bos(vocab);
-        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
@@ -2217,10 +2220,6 @@ struct server_context {
            slot.params.n_predict = slot.n_predict;
        }

-        if (slot.params.ignore_eos && has_eos_token) {
-            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
-        }
-
        {
            if (slot.smpl != nullptr) {
                common_sampler_free(slot.smpl);
Author	SHA1	Message	Date
Georgi Gerganov	225e7a1438	llama : add high-throughput mode (#14363 ) * kv-cache : prepare K/V buffers for separation ggml-ci * batched-bench : fix oob write ggml-ci * llama : add "virtual sequences" ggml-ci * llama : use "stream" vs "virtual sequence" ggml-ci * graph : fix stream splitting when KV cache is not used ggml-ci * kv-cache : add multi-stream save/load support ggml-ci * llama : add "--attn-streams" flag ggml-ci * kv-cache : fix handling when find_slot fails ggml-ci * kv-cache : restore find_slot impl ggml-ci * kv-cache : add comments * kv-cache : add bounds checks for sequence id ggml-ci * cont : add n_seq_max to batch allocr ggml-ci * kv-cache : perform stream copies lazily after llama_synchronize ggml-ci * kv-cache : avoid throwing exceptions across the C boundary ggml-ci * CUDA: 4D FlashAttention support (#14628) * CUDA: 4D FlashAttention support * CUDA: fix WMMA FA kernel * llama : rename attn_streams -> kv_unified ggml-ci * common : rename kv_split -> kv_unified ggml-ci --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>	2025-07-16 16:35:42 +03:00
Aman Gupta	ab14019821	Support diffusion models: Add Dream 7B (#14644 ) * Support diffusion models: Add Dream 7B * Move diffusion to examples * Move stuff to examples. Add patch to not use kv-cache * Address review comments * Make sampling fast * llama: remove diffusion functions * Add basic timings + cleanup * More cleanup * Review comments: better formating, use LOG instead std::cerr, re-use batch, use ubatch instead of max_length * fixup! * Review: move everything to diffusion-cli for now	2025-07-16 20:03:51 +08:00
Georgi Gerganov	64978340b0	ggml : add asserts (#14720 ) * ggml : add asserts ggml-ci * cont : fix constant type Co-authored-by: Diego Devesa <slarengh@gmail.com> --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>	2025-07-16 14:43:32 +03:00
Georgi Gerganov	6ffd4e9c44	server : pre-calculate EOG logit biases (#14721 ) ggml-ci	2025-07-16 14:04:12 +03:00
Shunta Saito	e4841d24d3	llama : fix parallel processing for plamo2 (#14716 )	2025-07-16 12:12:22 +02:00
Georgi Gerganov	538cc77f7f	server : fix handling of the ignore_eos flag (#14710 ) ggml-ci	2025-07-16 12:13:57 +03:00
Johannes Gäßler	5cae766541	scripts: synthetic prompt mode for server-bench.py (#14695 )	2025-07-16 09:33:28 +02:00
Sigbjørn Skjæret	4b91d6f71f	convert : only check for tokenizer folder if we need it (#14704 )	2025-07-16 08:52:04 +02:00
Sigbjørn Skjæret	cf91f217f1	convert : add pre-computed hashes first to prevent order mishaps (#14701 )	2025-07-16 08:51:12 +02:00
Min-Hua	79e0b68c17	llama: add LLAMA_API to deprecated llama_kv_self_seq_div (#14708 ) Add LLAMA_API to fix the run-time error with llama-cpp-python in Windows env: attributeError: function 'llama_kv_self_seq_div' not found. Did you mean: 'llama_kv_self_seq_add'? Although llama_kv_self_seq_div() has been marked deprecated but it is necessary to export it to make llama-cpp-python happy. Observed software version: OS: windows compiler: MSVC llama-cpp-python: tag: v0.3.12-cu124 llama.cpp: tag: b5833 Signed-off-by: Min-Hua Chen <minhuadotchen@gmail.com> Co-authored-by: Min-Hua Chen <minhua.chen@neuchips.ai>	2025-07-16 07:00:42 +03:00
Ed Addario	c81f4192f9	gguf-py : dump bpw per layer and model in markdown mode (#14703 )	2025-07-16 00:04:42 +02:00
Gabriel Larson	4a4f426944	model : add Kimi-K2 support (#14654 ) * Kimi-K2 conversion * add Kimi_K2 pre type * Kimi-K2 * Kimi-K2 unicode * Kimi-K2 * LLAMA_MAX_EXPERTS 384 * fix vocab iteration * regex space fix * add kimi-k2 to pre_computed_hashes * Updated with kimi-k2 get_vocab_base_pre hash * fix whitespaces * fix flake errors * remove more unicode.cpp whitespaces * change set_vocab() flow * add moonshotai-Kimi-K2.jinja to /models/templates/ * update moonshotai-Kimi-K2.jinja * add kimi-k2 chat template * add kimi-k2 * update NotImplementedError Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * except Exception Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * LLM_CHAT_TEMPLATE_KIMI_K2 if(add_ass){} --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-07-15 21:54:22 +02:00
Jeff Bolz	ba1ceb3456	vulkan: fix noncontig check for mat_mul_id splitting (#14683 ) * vulkan: fix noncontig check for mat_mul_id splitting Remove supports_op check for > 4096 (splitting fixes this) * vulkan: fix batched matmul dequant for Q*_K	2025-07-15 21:51:09 +02:00