CUDA: fix overflow in MMA kernel without stream-k (#17939 )

models : fix the attn_factor for mistral3 graphs + improve consistency (#17945 )
* models : fix the attn_factor for mistral3 graphs * cont : rework attn_factor correction logic * cont : make deepseek2 consistent * cont : add TODO * cont : special-case DSv2 * cont : revert Mistral 3 Large changes * cont : fix DS2 to use the original attn_factor * cont : minor comments
2025-12-12 17:43:58 +01:00 · 2025-12-12 17:12:40 +02:00 · 2025-12-12 15:49:27 +01:00 · 2025-12-12 16:26:03 +02:00 · 2025-12-12 15:16:06 +01:00 · 2025-12-12 15:13:36 +01:00
60 changed files with 997 additions and 495 deletions
@@ -1770,7 +1770,7 @@ jobs:
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

  ubuntu-cmake-sanitizer-riscv64-native:
    runs-on: RISCV64
@@ -1856,7 +1856,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
    add_opt(common_arg(
-        {"--mmproj"}, "FILE",
+        {"-mm", "--mmproj"}, "FILE",
        "path to a multimodal projector file. see tools/mtmd/README.md\n"
        "note: if -hf is used, this argument can be omitted",
        [](common_params & params, const std::string & value) {
@@ -1864,7 +1864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
    add_opt(common_arg(
-        {"--mmproj-url"}, "URL",
+        {"-mmu", "--mmproj-url"}, "URL",
        "URL to a multimodal projector file. see tools/mtmd/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #elif defined(LLAMA_USE_HTTPLIB)

-static bool is_output_a_tty() {
+class ProgressBar {
+    static inline std::mutex mutex;
+    static inline std::map<const ProgressBar *, int> lines;
+    static inline int max_line = 0;
+
+    static void cleanup(const ProgressBar * line) {
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
-    return _isatty(_fileno(stdout));
+        return _isatty(_fileno(stdout));
 #else
-    return isatty(1);
+        return isatty(1);
 #endif
-}
-
-static void print_progress(size_t current, size_t total) {
-    if (!is_output_a_tty()) {
-        return;
    }

-    if (!total) {
-        return;
+public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
    }

-    size_t width = 50;
-    size_t pct = (100 * current) / total;
-    size_t pos = (width * current) / total;
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) {
+            return;
+        }

-    std::cout << "["
-              << std::string(pos, '=')
-              << (pos < width ? ">" : "")
-              << std::string(width - pos, ' ')
-              << "] " << std::setw(3) << pct << "%  ("
-              << current / (1024 * 1024) << " MB / "
-              << total / (1024 * 1024) << " MB)\r";
-    std::cout.flush();
-}
+        if (!total) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (lines.find(this) == lines.end()) {
+            lines[this] = max_line++;
+            std::cout << "\n";
+        }
+        int lines_up = max_line - lines[this];
+
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s";
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "A";
+        }
+        std::cout << "\033[2K\r["
+            << std::string(pos, '=')
+            << (pos < width ? ">" : "")
+            << std::string(width - pos, ' ')
+            << "] " << std::setw(3) << pct << "%  ("
+            << current / (1024 * 1024) << " MB / "
+            << total / (1024 * 1024) << " MB) "
+            << "\033[u";
+
+        std::cout.flush();
+
+        if (current == total) {
+             cleanup(this);
+        }
+    }
+
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};

 static bool common_pull_file(httplib::Client & cli,
                             const std::string & resolve_path,
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
    const char * func = __func__; // avoid __func__ inside a lambda
    size_t downloaded = existing_size;
    size_t progress_step = 0;
+    ProgressBar bar;

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
@@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
            progress_step += len;

            if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                print_progress(downloaded, total_size);
+                bar.update(downloaded, total_size);
                progress_step = 0;
            }
            return true;
@@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
        nullptr
    );

-    std::cout << "\n";
-
    if (!res) {
        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
        return false;
@@ -7286,6 +7286,10 @@ class DeepseekV2Model(TextModel):
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+            # ref https://github.com/ggml-org/llama.cpp/pull/17945
            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])

    _experts: list[dict[str, Tensor]] | None = None
@@ -10041,6 +10045,10 @@ class MistralMoeModel(DeepseekV2Model):
        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
        yarn_params = self.hparams["yarn"]
        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+
+        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+        # ref https://github.com/ggml-org/llama.cpp/pull/17945
        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```

 ## Docker With CUDA
@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ## Docker With MUSA
@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
 ```bash
 docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
@@ -32,10 +32,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
    print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
    print(f"Max absolute difference: {max_diff:.4f}")

-    if max_diff > 1.0:
-        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
-        return False
-
    return True

 def main():
@@ -99,6 +99,7 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
    // other
    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void);  // risc-v vector length in bytes
    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
@@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
    size = aligned_offset(NULL, size, alloc->alignment);

-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, addr, tensor);
-#endif
-
    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

    // see if we can merge with an existing block
@@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
    }
    // otherwise, add a new block
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
-
-    GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten

    GGML_ASSERT(parent_size >= node_size);

+    // note: we want after the freeing the chunks to continue to be aligned
+    struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
+    node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
+
    if (parent_size > node_size) {
-        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
    }
 }

@@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
+
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, hn->addr, node);
+#endif
+
+    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
    hn->allocated = false;
 }

@@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
        case GGML_OP_GROUP_NORM:
+            return true;
        case GGML_OP_PAD:
            // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
            return ggml_get_op_params_i32(op, 8) == 0;
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif

+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif

 #if defined(_WIN32)

@@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
    GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
    return 1;
@@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) {
        ggml_init_arm_arch_features();
 #endif

+#if defined(__riscv)
+        ggml_init_riscv_arch_features();
+#endif
+
        is_first_call = false;
    }

@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
@@ -67,19 +67,22 @@
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA3_5    (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
 #define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

-#define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
-#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
-#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
-#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
-#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
-#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_AMD(cc)     (cc >= GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_RDNA(cc)    (cc >= GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_RDNA1(cc)   (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
+#define GGML_CUDA_CC_IS_RDNA2(cc)   (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
+#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA3(cc)   (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
+#define GGML_CUDA_CC_IS_RDNA4(cc)   (cc >= GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_GCN(cc)     (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)    (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA1(cc)   (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
+#define GGML_CUDA_CC_IS_CDNA2(cc)   (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
+#define GGML_CUDA_CC_IS_CDNA3(cc)   (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
@@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
    const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
    const int iter_j = (ne01 + (ncols1    - 1)) / ncols1;

-    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
    const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
    int bidx = bidx0 - 1;
    int kbc_stop = kbc0;
    while(true) {
-        const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+        const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
            kbc_stop = kbc;
@@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
    const int iter_j = (ne01.z + (ncols1    - 1)) / ncols1;

    // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
    // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16(
        const float2 * Q_f2   = (const float2 *) (Q + nb03*sequence + nb02* head0);
        const half2  * K_h2   = (const half2  *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
        const half   * mask_h = ncols2 == 1 && !mask ? nullptr :
-            (const half  *) (mask + nb33*(sequence % ne33));
+            (const half *) (mask + nb33*(sequence % ne33));
        float2       * dstk   = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);

        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
@@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CUMSUM:
        case GGML_OP_TRI:
        case GGML_OP_DIAG:
-            return true;
        case GGML_OP_SOLVE_TRI:
-            return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
+            return true;
+
        default:
            return false;
    }
@@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
                return 8 * (threadIdx.x / 16) + l;
 #elif defined(RDNA3)
                return 2 * l + (threadIdx.x / 16);
+#else
+                NO_DEVICE_CODE;
+                return -1;
 #endif // defined(RDNA4)
            } else {
                NO_DEVICE_CODE;
@@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
            }
        }
 #elif defined(AMD_WMMA_AVAILABLE)
-
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
        static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
@@ -310,7 +317,14 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
        static constexpr int         I  = I_;
        static constexpr int         J  = J_;
        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
-        static constexpr int         ne = I * J / WARP_SIZE;
-
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

 #if defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
+        static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
        static constexpr __device__ bool supported() {
            if (I == 16 && J == 8) return true;
            return false;
@@ -387,13 +406,23 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }
 #else
+        static constexpr int ne = I * J / WARP_SIZE;
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
        static constexpr __device__ bool supported() {
            if (I ==  8 && J ==  8) return true;
            if (I == 16 && J ==  4) return true;
@@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
        }
 #elif defined(AMD_WMMA_AVAILABLE)
        if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
-            ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
-
+#if defined(RDNA4)
+                ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+#elif defined(RDNA3)
+                ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+                ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
+#else
+                NO_DEVICE_CODE;
+#endif // defined(RDNA4)
        } else if constexpr (std::is_same_v<T, int>) {
            if constexpr (I == 16 && J == 4) {
                int64_t * xi = (int64_t *) t.x;
@@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
        const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
        const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+        using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
+        using floatx8_t = __attribute__((ext_vector_type(8))) float;
+        floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+        const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
+        const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
+        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
 #endif // RDNA4
 #else
        GGML_UNUSED_VARS(D, A, B);
@@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
        const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
        const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+        using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
+        using floatx8_t = __attribute__((ext_vector_type(8))) float;
+        floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+        const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
+        const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
+        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
 #endif // RDNA4
 #else
        GGML_UNUSED_VARS(D, A, B);
@@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
            return false;
        }
    } else {
-        if (src1_ncols > 16) {
+        if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
+            return false;
+        } else if (src1_ncols > 16) {
            return false;
        }
    }
@@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
        case GGML_TYPE_F32:
            return ampere_mma_available(cc);
        case GGML_TYPE_F16:
-            return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
        case GGML_TYPE_BF16:
-            return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return ampere_mma_available(cc) || amd_wmma_available(cc);
        default:
            return false;
    }
@@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                return ne11 <= 8;
            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+                        return ne11 <= 3;
+                    }
+                    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
                        return ne11 <= 5;
                    }
                    return ne11 <= 2;
@@ -3,6 +3,80 @@
 #include "solve_tri.cuh"

 #define MAX_N_FAST 64
+#define MAX_K_FAST 32
+
+static __global__ void get_batch_pointers(const float *  A,
+                                          float *        X,
+                                          const float ** A_ptrs,
+                                          float **       X_ptrs,
+                                          int64_t        ne02,
+                                          int64_t        total_batches,
+                                          size_t         s02,
+                                          size_t         s03,
+                                          size_t         s2,
+                                          size_t         s3) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total_batches) {
+        return;
+    }
+
+    const int64_t i3 = idx / ne02;
+    const int64_t i2 = idx % ne02;
+
+    A_ptrs[idx] = A + i3 * s03 + i2 * s02;
+    X_ptrs[idx] = X + i3 * s3 + i2 * s2;
+}
+
+static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
+                                 const float *               A,
+                                 const float *               B,
+                                 float *                     X,
+                                 int                         n,
+                                 int                         k,
+                                 int64_t                     ne02,
+                                 int64_t                     ne03,
+                                 size_t                      s02,
+                                 size_t                      s03,
+                                 size_t                      s12,
+                                 size_t                      s13,
+                                 size_t                      s2,
+                                 size_t                      s3,
+                                 cudaStream_t                stream) {
+    const float   alpha         = 1.0f;
+    const int64_t total_batches = ne02 * ne03;
+    if (total_batches == 0) {
+        return;
+    }
+
+    // Bulk copy B -> X (contiguous tensors)
+    if (X != B) {
+        const int64_t total_elements_BX = n * k * total_batches;
+        CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+    }
+
+    const int id = ggml_cuda_get_device();
+
+    ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
+    ggml_cuda_pool_alloc<float *>       X_ptrs_alloc(ctx.pool(id), total_batches);
+
+    const float ** A_ptrs_dev = A_ptrs_alloc.get();
+    float **       X_ptrs_dev = X_ptrs_alloc.get();
+
+    get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
+                                                                        total_batches, s02, s03, s2, s3);
+
+    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+
+    // Yes, this is necessary, without this we get RMSE errors
+    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
+    CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                                    CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
+
+    // revert to standard mode from common.cuh
+    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
+
+    GGML_UNUSED_VARS(s12, s13);
+}

 // ======================
 // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
    float x_low  = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
    float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;

-    const int half = WARP_SIZE;
+    const int half      = WARP_SIZE;
    const int nrows_low = (n < half) ? n : half;

 #pragma unroll
@@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,

 #pragma unroll
    for (int row = half; row < n; ++row) {
-        float sum = sA[row * n + lane] * x_low;
-        const int j = half + lane;
+        float     sum = sA[row * n + lane] * x_low;
+        const int j   = half + lane;
        if (j < row) {
            sum += sA[row * n + j] * x_high;
        }
@@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
    for (int rr = 0; rr < 2; ++rr) {
        const int row = rr * WARP_SIZE + lane;
        if (row < n) {
-            const float val = (row < half) ? x_low : x_high;
+            const float val            = (row < half) ? x_low : x_high;
            X_batch[row * k + col_idx] = val;
        }
    }
@@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
 }

 void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];  // A (triangular n x x matrix)
-    const ggml_tensor * src1 = dst->src[1];  // B (right hand side of n x k equation columns)
+    const ggml_tensor * src0 = dst->src[0];  // A (n×n, lower triangular)
+    const ggml_tensor * src1 = dst->src[1];  // B (n×k)

    ggml_is_contiguous(src0);
    ggml_is_contiguous(src1);

-    const int64_t n = src0->ne[0];
-    const int64_t k = src1->ne[0];
+    const int64_t n    = src0->ne[0];
+    const int64_t k    = src1->ne[0];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];

-    GGML_ASSERT(n <= 64);
-    GGML_ASSERT(k <= 32);
-
-    solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
-                       src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
-                       src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
-                       dst->nb[3] / sizeof(float), ctx.stream());
+    if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
+        solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+                           src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                           src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                           dst->nb[3] / sizeof(float), ctx.stream());
+    } else {
+        solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+                             ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                             src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                             dst->nb[3] / sizeof(float), ctx.stream());
+    }
 }
@@ -19,6 +19,9 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_16BF HIPBLAS_R_16B
 #define CUDA_R_32F  HIPBLAS_R_32F
+#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
 #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@@ -30,6 +33,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define __all_sync(mask, var) __all(var)
 #define __any_sync(mask, var) __any(var)
+#define cublasStrsmBatched hipblasStrsmBatched
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
@@ -12,11 +12,16 @@
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
+#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F  MUSA_R_32F
+#define cublasStrsmBatched mublasStrsmBatched
 #define cublasComputeType_t cudaDataType_t
 #define cublasCreate mublasCreate
 #define cublasDestroy mublasDestroy
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
    return (1 - MIN(1, MAX(0, y)));
 }

-static void rope_cache_init(const float   theta_base,
-                            float         freq_scale,
-                            const float * freq_factors,
-                            float *       corr_dims,
-                            uint32_t      ne0,
-                            float         ext_factor,
-                            float         mscale,
-                            float *       cache,
-                            float         theta_scale) {
+static void rope_cache_init(const float    theta_base,
+                            const float    freq_scale,
+                            const float *  freq_factors,
+                            float *        corr_dims,
+                            const uint32_t ne0,
+                            const float    ext_factor,
+                            const float    mscale,
+                            float *        cache,
+                            const float    theta_scale) {
    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
    float theta = theta_base;

@@ -92,18 +92,19 @@ static void rope_cache_init(const float   theta_base,

        // Get n-d rotational scaling corrected for extrapolation
        float theta_interp = freq_scale * theta_extrap;
-        float theta2       = theta_interp;
+        float theta_final  = theta_interp;
+        float mscale_final = mscale;

        if (ext_factor != 0.0f) {
            float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta2         = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta_final    = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

            // Get n-d magnitude scaling corrected for interpolation
-            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
        }

-        cache[i0 + 0] = cosf(theta2) * mscale;
-        cache[i0 + 1] = sinf(theta2) * mscale;
+        cache[i0 + 0] = cosf(theta_final) * mscale_final;
+        cache[i0 + 1] = sinf(theta_final) * mscale_final;

        theta *= theta_scale;
    }
@@ -151,9 +152,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
 }

 static void hvx_calc_rope_neox_f32(const float * restrict src0,
-                              float * restrict dst,
-                              const int num_elems,
-                              const float * restrict theta_cache) {
+                                   float * restrict dst,
+                                   const int num_elems,
+                                   const float * restrict theta_cache) {
    // for (int i = 0; i < num_elems; i += 2) {
    //const float cos_theta = theta_cache[i + 0];
    //const float sin_theta = theta_cache[i + 1];
@@ -192,7 +193,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

-        *(HVX_Vector *) dst_curr          = Q6_Vsf_equals_Vqf32(v4);
+        *(HVX_Vector *) dst_curr               = Q6_Vsf_equals_Vqf32(v4);
        *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);

        src0_curr += VLEN;
@@ -259,7 +260,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                         const uint32_t       ir1,
                         int                  nth,
                         int                  ith,
-                         int                  opt_path) {
+                         const int            opt_path) {
    struct htp_ops_context * octx = rope_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
@@ -267,8 +268,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
    const struct htp_tensor * src2 = &octx->src2;
    struct htp_tensor *       dst  = &octx->dst;

-    const int32_t mode  = rope_ctx->mode;
-    const bool is_neox  = mode & HTP_ROPE_TYPE_NEOX;
+    const int32_t mode    = rope_ctx->mode;
+    const bool    is_neox = mode & HTP_ROPE_TYPE_NEOX;

    htp_rope_preamble;

@@ -281,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
        freq_factors = (const float *) src2->data;
    }

-    int ir = 0;
-
+    const uint32_t i1_end       = MIN(ir1, ne1);
+    const int32_t  half_dims    = rope_ctx->n_dims / 2;
+    const size_t   remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
    for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
        for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
            const int32_t p = pos[i2];
@@ -290,14 +292,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
            rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                            rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);

-            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
-                if (ir++ < ir0) {
-                    continue;
-                }
-                if (ir > ir1) {
-                    break;
-                }
-
+            for (uint32_t i1 = ir0; i1 < i1_end; i1++) {  // attn-heads
                const float * src      = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                float *       dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);

@@ -310,6 +305,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                    } else {
                        hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                    }
+
+                    src_loc += rope_ctx->n_dims;
+                    dst_data_loc += rope_ctx->n_dims;
                } else {
                    for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                        const float cos_theta = wp0[i0 + 0];
@@ -317,10 +315,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,

                        if (is_neox) {
                            const float x0 = src_loc[0];
-                            const float x1 = src_loc[rope_ctx->n_dims/2];
+                            const float x1 = src_loc[half_dims];

-                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
-                            dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
+                            dst_data_loc[0]         = x0 * cos_theta - x1 * sin_theta;
+                            dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;

                            src_loc += 1;
                            dst_data_loc += 1;
@@ -335,15 +333,13 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                            dst_data_loc += 2;
                        }
                    }
+
+                    src_loc += (is_neox ? half_dims : 0);
+                    dst_data_loc += (is_neox ? half_dims : 0);
                }

-                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
-                    dst_data_loc[0] = src_loc[0];
-                    dst_data_loc[1] = src_loc[1];
-
-                    src_loc += 2;
-                    dst_data_loc += 2;
-                }
+                // TODO: use simd to speed up the remaining elements copy
+                memcpy(dst_data_loc, src_loc, remain_bytes);
            }
        }
    }
@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
    udata->output    .resize(n_tokens);

+    udata->seq_id_data.reserve(n_tokens);
+
    seq_set_t seq_set_unq;

    for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }

        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i]   = batch.seq_id[idxs[i]];
        udata->output[i]   = batch.logits[idxs[i]];

        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            seq_set_unq.set(udata->seq_id[i][s]);
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
        }

        if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }
    }

+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            udata->seq_idx[s] = udata->seq_id_unq.size();
@@ -56,13 +56,15 @@ struct llama_ubatch {
        std::vector<float>          embd;
        std::vector<llama_pos>      pos;
        std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id;      // these point into the seq_id_data below
        std::vector<llama_seq_id>   seq_id_unq;
        std::vector<int32_t>        seq_idx;
        std::vector<int8_t>         output;
+
+        std::vector<llama_seq_id> seq_id_data;
    };

-    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
    std::shared_ptr<data_t> data;
 };

@@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    freq_base        (cparams.rope_freq_base),
    freq_scale       (cparams.rope_freq_scale),
    ext_factor       (cparams.yarn_ext_factor),
-    attn_factor      (cparams.yarn_attn_factor),
+    attn_factor      (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
    beta_fast        (cparams.yarn_beta_fast),
    beta_slow        (cparams.yarn_beta_slow),
    norm_eps         (hparams.f_norm_eps),
@@ -1,7 +1,9 @@
 #include "llama-hparams.h"

 #include "ggml.h"
+
 #include <cassert>
+#include <cmath>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
    if (dense_first) {
@@ -229,3 +231,13 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

    return false;
 }
+
+float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
+    GGML_ASSERT(ext_factor >= 0.0f);
+
+    if (ext_factor != 0.0f) {
+        attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    }
+
+    return attn_factor;
+}
@@ -107,6 +107,7 @@ struct llama_hparams {
    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
    float    rope_freq_scale_train_swa;
+
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;

@@ -267,7 +268,13 @@ struct llama_hparams {
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
+
+    // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+    // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+    //
+    // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+    //      https://github.com/ggml-org/llama.cpp/pull/17945
+    static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-
@@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                      float   freq_scale) const {
    const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

-    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
-    const auto & yarn_beta_fast  = cparams.yarn_beta_fast;
-    const auto & yarn_beta_slow  = cparams.yarn_beta_slow;
+    const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
+    const auto & yarn_beta_fast   = cparams.yarn_beta_fast;
+    const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
+    const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);

    const auto & n_rot     = hparams.n_rot;
    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -1382,12 +1383,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                                ? LLAMA_ROPE_TYPE_NEOX
                                : hparams.rope_type;

-    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
-                                    ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
-                                    : cparams.yarn_attn_factor;
-
    ggml_tensor * tmp;

    if (ggml_is_quantized(cur->type)) {
@@ -1635,7 +1635,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }

                // (optional) temperature tuning - used by mistral-large
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
@@ -2267,9 +2272,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,     hparams.rope_yarn_log_mul, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);

                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2279,18 +2284,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    }
                }

-                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-                //       but may need further verification with other values
-                if (hparams.rope_yarn_log_mul != 0.0f) {
-                    float factor = 1.0f / hparams.rope_freq_scale_train;
-                    float mscale = 1.0f;
-                    float mscale_all_dims = hparams.rope_yarn_log_mul;
-                    static auto get_mscale = [](float scale, float mscale) {
-                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                    };
-                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-                }
-
                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_3B; break;
                    case 34: type = LLM_TYPE_8B; break;
@@ -2301,6 +2294,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        default: throw std::runtime_error("unsupported model architecture");
    }

+    // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+    if (hparams.rope_yarn_log_mul != 0.0f) {
+        const float factor = 1.0f / hparams.rope_freq_scale_train;
+
+        // note: here we assume `mscale == 1.0f`
+        // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+              float mscale          = 1.0f;
+        const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        // special-case DEEPSEEK v2:
+        // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+        if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+            mscale = mscale_all_dims;
+        }
+
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+        LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
+    }
+
    pimpl->n_bytes = ml.n_bytes;

    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
@@ -6806,6 +6825,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n",   __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6869,7 +6889,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {
@@ -1,7 +1,5 @@
 #include "models.h"

-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale      = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-    const float kq_scale    = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
-    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));

    ggml_tensor * cur;
    ggml_tensor * inpL;
@@ -7861,9 +7861,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));

    for (bool v : {false, true}) {
        for (bool circular : {false, true}) {
@@ -8064,12 +8079,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8,  1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8,  1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
    // qwen3next with CHUNK_SIZE 64
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
    // qwen3next with CHUNK_SIZE 128
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));

    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
@@ -53,6 +53,15 @@ if (TARGET BUILD_INFO)
    add_dependencies(mtmd-helper BUILD_INFO)
 endif()

+# if mtmd is linked against common, we throw an error
+if (TARGET mtmd)
+    get_target_property(libs mtmd LINK_LIBRARIES)
+    if (libs AND "common" IN_LIST libs)
+        message(FATAL_ERROR "mtmd is designed to be a public library.\n"
+                            "It must not link against common")
+    endif()
+endif()
+
 add_executable(llama-llava-cli    deprecation-warning.cpp)
 add_executable(llama-gemma3-cli   deprecation-warning.cpp)
 add_executable(llama-minicpmv-cli deprecation-warning.cpp)
@@ -13,6 +13,8 @@

 // Internal header for clip.cpp

+#define MTMD_INTERNAL_HEADER
+
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
@@ -595,11 +595,12 @@ struct clip_graph {
            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);

        } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
            cur = build_ffn(cur,
@@ -667,16 +668,12 @@ struct clip_graph {

        // LlavaMultiModalProjector (always using GELU activation)
        {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            if (model.mm_2_b) {
-                cur = ggml_add(ctx0, cur, model.mm_2_b);
-            }
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
        }

        // arrangement of the [IMG_BREAK] token
@@ -866,16 +863,12 @@ struct clip_graph {
        // multimodal projection
        ggml_tensor * embeddings = inpL;
        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+        embeddings = build_ffn(embeddings,
+                            model.mm_0_w, model.mm_0_b,
+                            nullptr, nullptr,
+                            model.mm_1_w, model.mm_1_b,
+                            FFN_GELU,
+                            -1);

        if (use_window_attn) {
            window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@@ -1253,11 +1246,12 @@ struct clip_graph {
            // projector LayerNorm uses pytorch's default eps = 1e-5
            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_3_w, model.mm_3_b,
+                FFN_GELU,
+                -1);
        }

        // build the graph
@@ -1408,11 +1402,12 @@ struct clip_graph {
            cb(cur, "proj_inp_normed", -1);

            // projection mlp
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
            cb(cur, "proj_out", -1);
        }

@@ -1883,9 +1878,12 @@ struct clip_graph {

        } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
            // projector
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_gelu_erf(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU_ERF,
+                -1);

        } else {
            GGML_ABORT("%s: unknown projector type", __func__);
@@ -2070,34 +2068,66 @@ private:

            // self-attention
            {
-                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-                if (layer.q_b) {
-                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-                }
+                ggml_tensor * Qcur = nullptr;
+                ggml_tensor * Kcur = nullptr;
+                ggml_tensor * Vcur = nullptr;
+                if (layer.qkv_w != nullptr) {
+                    // fused qkv
+                    cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                    if (layer.qkv_b != nullptr) {
+                        cur = ggml_add(ctx0, cur, layer.qkv_b);
+                    }

-                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-                if (layer.k_b) {
-                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-                }
+                    Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ 0);

-                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-                if (layer.v_b) {
-                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-                }
+                    Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ ggml_row_size(cur->type, n_embd));

-                if (layer.q_norm) {
-                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-                    cb(Qcur, "Qcur_norm", il);
-                }
+                    Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ ggml_row_size(cur->type, 2 * n_embd));

-                if (layer.k_norm) {
-                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-                    cb(Kcur, "Kcur_norm", il);
-                }
+                    // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                    // we can add support in the future if needed
+                    GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);

-                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                } else {
+                    // separate q, k, v
+                    Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                    if (layer.q_b) {
+                        Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                    }
+
+                    Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                    if (layer.k_b) {
+                        Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                    }
+
+                    Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                    if (layer.v_b) {
+                        Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                    }
+
+                    if (layer.q_norm) {
+                        Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                        cb(Qcur, "Qcur_norm", il);
+                    }
+
+                    if (layer.k_norm) {
+                        Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                        cb(Kcur, "Kcur_norm", il);
+                    }
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                    Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                }

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
@@ -7,6 +7,8 @@

 // !!! Internal header, to be used by mtmd only !!!

+#define MTMD_INTERNAL_HEADER
+
 struct clip_ctx;

 struct clip_image_size {
@@ -6,6 +6,8 @@
 #include <vector>
 #include <string>

+#define MTMD_INTERNAL_HEADER
+
 #define WHISPER_ASSERT GGML_ASSERT

 #define WHISPER_SAMPLE_RATE 16000
@@ -318,7 +318,9 @@ int main(int argc, char ** argv) {
        g_is_generating = true;
        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
            for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
+                // most models require the marker before each image
+                // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+                params.prompt = mtmd_default_marker() + params.prompt;
            }
        }
        common_chat_msg msg;
@@ -32,6 +32,10 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb/stb_image.h"

+#ifdef MTMD_INTERNAL_HEADER
+#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
+#endif
+
 //
 // internal logging functions
 //
@@ -32,23 +32,32 @@ fi

 arr_prefix=()
 arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
 arr_file=()

 add_test_vision() {
    local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
    arr_prefix+=("[vision]")
    arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
    arr_file+=("test-1.jpeg")
 }

 add_test_audio() {
    local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
    arr_prefix+=("[audio] ")
    arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
    arr_file+=("test-2.mp3")
 }

@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K"            "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M"         "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
 add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -79,7 +88,7 @@ add_test_audio  "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
    add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
    add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
    add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
    # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working

    add_test_audio  "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
    add_test_audio  "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
    bin="llama-mtmd-cli"
    prefix="${arr_prefix[$i]}"
    hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
    inp_file="${arr_file[$i]}"

    echo "Running test with binary: $bin and HF model: $hf"
    echo ""
    echo ""

-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
        --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+        ${extra_args}"
+
+    # if extra_args does not contain -p, we add a default prompt
+    if ! [[ "$extra_args" =~ "-p" ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)

    echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
    if echo "$output" | grep -iq "new york" \
            || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
    then
-        result="$prefix \033[32mOK\033[0m:   $bin $hf"
+        result="$prefix \033[32mOK\033[0m:   $hf"
    else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
    fi
    echo -e "$result"
    arr_res+=("$result")
@@ -166,8 +166,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
-| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
+| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
+| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
 | `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
 | `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
@@ -41,7 +41,7 @@
 				"@tailwindcss/vite": "^4.0.0",
 				"@types/node": "^22",
 				"@vitest/browser": "^3.2.3",
-				"bits-ui": "^2.8.11",
+				"bits-ui": "^2.14.4",
 				"clsx": "^2.1.1",
 				"dexie": "^4.0.11",
 				"eslint": "^9.18.0",
@@ -3343,17 +3343,17 @@
 			}
 		},
 		"node_modules/bits-ui": {
-			"version": "2.8.11",
-			"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz",
-			"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==",
+			"version": "2.14.4",
+			"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
+			"integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
 			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"@floating-ui/core": "^1.7.1",
 				"@floating-ui/dom": "^1.7.1",
 				"esm-env": "^1.1.2",
-				"runed": "^0.29.1",
-				"svelte-toolbelt": "^0.9.3",
+				"runed": "^0.35.1",
+				"svelte-toolbelt": "^0.10.6",
 				"tabbable": "^6.2.0"
 			},
 			"engines": {
@@ -3368,9 +3368,9 @@
 			}
 		},
 		"node_modules/bits-ui/node_modules/runed": {
-			"version": "0.29.2",
-			"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz",
-			"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==",
+			"version": "0.35.1",
+			"resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
+			"integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
 			"dev": true,
 			"funding": [
 				"https://github.com/sponsors/huntabyte",
@@ -3378,23 +3378,31 @@
 			],
 			"license": "MIT",
 			"dependencies": {
-				"esm-env": "^1.0.0"
+				"dequal": "^2.0.3",
+				"esm-env": "^1.0.0",
+				"lz-string": "^1.5.0"
 			},
 			"peerDependencies": {
+				"@sveltejs/kit": "^2.21.0",
 				"svelte": "^5.7.0"
+			},
+			"peerDependenciesMeta": {
+				"@sveltejs/kit": {
+					"optional": true
+				}
 			}
 		},
 		"node_modules/bits-ui/node_modules/svelte-toolbelt": {
-			"version": "0.9.3",
-			"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz",
-			"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==",
+			"version": "0.10.6",
+			"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
+			"integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
 			"dev": true,
 			"funding": [
 				"https://github.com/sponsors/huntabyte"
 			],
 			"dependencies": {
 				"clsx": "^2.1.1",
-				"runed": "^0.29.0",
+				"runed": "^0.35.1",
 				"style-to-object": "^1.0.8"
 			},
 			"engines": {
@@ -43,7 +43,7 @@
 		"@tailwindcss/vite": "^4.0.0",
 		"@types/node": "^22",
 		"@vitest/browser": "^3.2.3",
-		"bits-ui": "^2.8.11",
+		"bits-ui": "^2.14.4",
 		"clsx": "^2.1.1",
 		"dexie": "^4.0.11",
 		"eslint": "^9.18.0",
@@ -331,6 +331,7 @@
 	class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
 		? 'cursor-not-allowed opacity-60'
 		: ''} {className}"
+	data-slot="chat-form"
 >
 	<ChatAttachmentsList
 		bind:uploadedFiles
@@ -1,6 +1,5 @@
 <script lang="ts">
-	import { Input } from '$lib/components/ui/input';
-	import { Search } from '@lucide/svelte';
+	import { SearchInput } from '$lib/components/app';

 	interface Props {
 		value?: string;
@@ -15,19 +14,6 @@
 		onInput,
 		class: className
 	}: Props = $props();
-
-	function handleInput(event: Event) {
-		const target = event.target as HTMLInputElement;
-
-		value = target.value;
-		onInput?.(target.value);
-	}
 </script>

-<div class="relative mb-4 {className}">
-	<Search
-		class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
-	/>
-
-	<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
-</div>
+<SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />
@@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
 export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
 export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
 export { default as RemoveButton } from './misc/RemoveButton.svelte';
+export { default as SearchInput } from './misc/SearchInput.svelte';
 export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
 export { default as ModelsSelector } from './models/ModelsSelector.svelte';

@@ -0,0 +1,73 @@
+<script lang="ts">
+	import { Input } from '$lib/components/ui/input';
+	import { Search, X } from '@lucide/svelte';
+
+	interface Props {
+		value?: string;
+		placeholder?: string;
+		onInput?: (value: string) => void;
+		onClose?: () => void;
+		onKeyDown?: (event: KeyboardEvent) => void;
+		class?: string;
+		id?: string;
+		ref?: HTMLInputElement | null;
+	}
+
+	let {
+		value = $bindable(''),
+		placeholder = 'Search...',
+		onInput,
+		onClose,
+		onKeyDown,
+		class: className,
+		id,
+		ref = $bindable(null)
+	}: Props = $props();
+
+	let showClearButton = $derived(!!value || !!onClose);
+
+	function handleInput(event: Event) {
+		const target = event.target as HTMLInputElement;
+
+		value = target.value;
+		onInput?.(target.value);
+	}
+
+	function handleClear() {
+		if (value) {
+			value = '';
+			onInput?.('');
+			ref?.focus();
+		} else {
+			onClose?.();
+		}
+	}
+</script>
+
+<div class="relative {className}">
+	<Search
+		class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
+	/>
+
+	<Input
+		{id}
+		bind:value
+		bind:ref
+		class="pl-9 {showClearButton ? 'pr-9' : ''}"
+		oninput={handleInput}
+		onkeydown={onKeyDown}
+		{placeholder}
+		type="search"
+	/>
+
+	{#if showClearButton}
+		<button
+			type="button"
+			class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
+			onclick={handleClear}
+			aria-label={value ? 'Clear search' : 'Close'}
+		>
+			<X class="h-4 w-4" />
+		</button>
+	{/if}
+</div>
@@ -2,8 +2,8 @@
 	import { onMount, tick } from 'svelte';
 	import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
 	import * as Tooltip from '$lib/components/ui/tooltip';
+	import * as Popover from '$lib/components/ui/popover';
 	import { cn } from '$lib/components/ui/utils';
-	import { portalToBody } from '$lib/utils';
 	import {
 		modelsStore,
 		modelOptions,
@@ -17,12 +17,8 @@
 	import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
 	import { ServerModelStatus } from '$lib/enums';
 	import { isRouterMode } from '$lib/stores/server.svelte';
-	import { DialogModelInformation } from '$lib/components/app';
-	import {
-		MENU_MAX_WIDTH,
-		MENU_OFFSET,
-		VIEWPORT_GUTTER
-	} from '$lib/constants/floating-ui-constraints';
+	import { DialogModelInformation, SearchInput } from '$lib/components/app';
+	import type { ModelOption } from '$lib/types/models';

 	interface Props {
 		class?: string;
@@ -145,185 +141,126 @@
 		return options.some((option) => option.model === currentModel);
 	});

-	let isOpen = $state(false);
-	let showModelDialog = $state(false);
-	let container: HTMLDivElement | null = null;
-	let menuRef = $state<HTMLDivElement | null>(null);
-	let triggerButton = $state<HTMLButtonElement | null>(null);
-	let menuPosition = $state<{
-		top: number;
-		left: number;
-		width: number;
-		placement: 'top' | 'bottom';
-		maxHeight: number;
-	} | null>(null);
+	let searchTerm = $state('');
+	let searchInputRef = $state<HTMLInputElement | null>(null);
+	let highlightedIndex = $state<number>(-1);

-	onMount(async () => {
-		try {
-			await modelsStore.fetch();
-		} catch (error) {
-			console.error('Unable to load models:', error);
-		}
+	let filteredOptions: ModelOption[] = $derived(
+		(() => {
+			const term = searchTerm.trim().toLowerCase();
+			if (!term) return options;
+
+			return options.filter(
+				(option) =>
+					option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
+			);
+		})()
+	);
+
+	// Get indices of compatible options for keyboard navigation
+	let compatibleIndices = $derived(
+		filteredOptions
+			.map((option, index) => (isModelCompatible(option) ? index : -1))
+			.filter((i) => i !== -1)
+	);
+
+	// Reset highlighted index when search term changes
+	$effect(() => {
+		void searchTerm;
+		highlightedIndex = -1;
 	});

-	function toggleOpen() {
+	let isOpen = $state(false);
+	let showModelDialog = $state(false);
+
+	onMount(() => {
+		modelsStore.fetch().catch((error) => {
+			console.error('Unable to load models:', error);
+		});
+	});
+
+	function handleOpenChange(open: boolean) {
 		if (loading || updating) return;

-		if (isRouter) {
-			// Router mode: show dropdown
-			if (isOpen) {
-				closeMenu();
-			} else {
-				openMenu();
+		if (open) {
+			isOpen = true;
+			searchTerm = '';
+			highlightedIndex = -1;
+
+			// Focus search input after popover opens
+			tick().then(() => {
+				requestAnimationFrame(() => searchInputRef?.focus());
+			});
+
+			if (isRouter) {
+				modelsStore.fetchRouterModels().then(() => {
+					modelsStore.fetchModalitiesForLoadedModels();
+				});
 			}
 		} else {
-			// Single model mode: show dialog
-			showModelDialog = true;
+			isOpen = false;
+			searchTerm = '';
+			highlightedIndex = -1;
 		}
 	}

-	async function openMenu() {
+	function handleTriggerClick() {
 		if (loading || updating) return;

-		isOpen = true;
-		await tick();
-		updateMenuPosition();
-		requestAnimationFrame(() => updateMenuPosition());
-
-		if (isRouter) {
-			modelsStore.fetchRouterModels().then(() => {
-				modelsStore.fetchModalitiesForLoadedModels();
-			});
+		if (!isRouter) {
+			// Single model mode: show dialog instead of popover
+			showModelDialog = true;
 		}
+		// For router mode, the Popover handles open/close
 	}

 	export function open() {
 		if (isRouter) {
-			openMenu();
+			handleOpenChange(true);
 		} else {
 			showModelDialog = true;
 		}
 	}

 	function closeMenu() {
-		if (!isOpen) return;
-
-		isOpen = false;
-		menuPosition = null;
+		handleOpenChange(false);
 	}

-	function handlePointerDown(event: PointerEvent) {
-		if (!container) return;
+	function handleSearchKeyDown(event: KeyboardEvent) {
+		if (event.isComposing) return;

-		const target = event.target as Node | null;
+		if (event.key === 'ArrowDown') {
+			event.preventDefault();
+			if (compatibleIndices.length === 0) return;

-		if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
-			closeMenu();
-		}
-	}
-
-	function handleKeydown(event: KeyboardEvent) {
-		if (event.key === 'Escape') {
-			closeMenu();
-		}
-	}
-
-	function handleResize() {
-		if (isOpen) {
-			updateMenuPosition();
-		}
-	}
-
-	function updateMenuPosition() {
-		if (!isOpen || !triggerButton || !menuRef) return;
-
-		const triggerRect = triggerButton.getBoundingClientRect();
-		const viewportWidth = window.innerWidth;
-		const viewportHeight = window.innerHeight;
-
-		if (viewportWidth === 0 || viewportHeight === 0) return;
-
-		const scrollWidth = menuRef.scrollWidth;
-		const scrollHeight = menuRef.scrollHeight;
-
-		const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
-		const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
-		const safeMaxWidth =
-			constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
-		const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
-
-		let width = Math.min(
-			Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
-			safeMaxWidth || 320
-		);
-
-		const availableBelow = Math.max(
-			0,
-			viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
-		);
-		const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
-		const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
-		const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
-
-		function computePlacement(placement: 'top' | 'bottom') {
-			const available = placement === 'bottom' ? availableBelow : availableAbove;
-			const allowedHeight =
-				available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
-			const maxHeight = Math.min(scrollHeight, allowedHeight);
-			const height = Math.max(0, maxHeight);
-
-			let top: number;
-			if (placement === 'bottom') {
-				const rawTop = triggerRect.bottom + MENU_OFFSET;
-				const minTop = VIEWPORT_GUTTER;
-				const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
-				if (maxTop < minTop) {
-					top = minTop;
-				} else {
-					top = Math.min(Math.max(rawTop, minTop), maxTop);
-				}
+			const currentPos = compatibleIndices.indexOf(highlightedIndex);
+			if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
+				highlightedIndex = compatibleIndices[0];
 			} else {
-				const rawTop = triggerRect.top - MENU_OFFSET - height;
-				const minTop = VIEWPORT_GUTTER;
-				const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
-				if (maxTop < minTop) {
-					top = minTop;
-				} else {
-					top = Math.max(Math.min(rawTop, maxTop), minTop);
+				highlightedIndex = compatibleIndices[currentPos + 1];
+			}
+		} else if (event.key === 'ArrowUp') {
+			event.preventDefault();
+			if (compatibleIndices.length === 0) return;
+
+			const currentPos = compatibleIndices.indexOf(highlightedIndex);
+			if (currentPos === -1 || currentPos === 0) {
+				highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
+			} else {
+				highlightedIndex = compatibleIndices[currentPos - 1];
+			}
+		} else if (event.key === 'Enter') {
+			event.preventDefault();
+			if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
+				const option = filteredOptions[highlightedIndex];
+				if (isModelCompatible(option)) {
+					handleSelect(option.id);
 				}
-			}
-
-			return { placement, top, height, maxHeight };
-		}
-
-		const belowMetrics = computePlacement('bottom');
-		const aboveMetrics = computePlacement('top');
-
-		let metrics = belowMetrics;
-		if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
-			metrics = aboveMetrics;
-		}
-
-		let left = triggerRect.right - width;
-		const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
-		if (maxLeft < VIEWPORT_GUTTER) {
-			left = VIEWPORT_GUTTER;
-		} else {
-			if (left > maxLeft) {
-				left = maxLeft;
-			}
-			if (left < VIEWPORT_GUTTER) {
-				left = VIEWPORT_GUTTER;
+			} else if (compatibleIndices.length > 0) {
+				// No selection - highlight first compatible option
+				highlightedIndex = compatibleIndices[0];
 			}
 		}
-
-		menuPosition = {
-			top: Math.round(metrics.top),
-			left: Math.round(left),
-			width: Math.round(width),
-			placement: metrics.placement,
-			maxHeight: Math.round(metrics.maxHeight)
-		};
 	}

 	async function handleSelect(modelId: string) {
@@ -356,6 +293,14 @@

 		if (shouldCloseMenu) {
 			closeMenu();
+
+			// Focus the chat textarea after model selection
+			requestAnimationFrame(() => {
+				const textarea = document.querySelector<HTMLTextAreaElement>(
+					'[data-slot="chat-form"] textarea'
+				);
+				textarea?.focus();
+			});
 		}
 	}

@@ -404,10 +349,7 @@
 	}
 </script>

-<svelte:window onresize={handleResize} />
-<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
-
-<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
+<div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
 	{#if loading && options.length === 0 && isRouter}
 		<div class="flex items-center gap-2 text-xs text-muted-foreground">
 			<Loader2 class="h-3.5 w-3.5 animate-spin" />
@@ -418,9 +360,8 @@
 	{:else}
 		{@const selectedOption = getDisplayOption()}

-		<div class="relative">
-			<button
-				type="button"
+		<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
+			<Popover.Trigger
 				class={cn(
 					`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
 					!isCurrentModelInCache()
@@ -430,15 +371,11 @@
 							: isHighlightedCurrentModelActive
 								? 'text-foreground'
 								: 'text-muted-foreground',
-					isOpen ? 'text-foreground' : '',
-					className
+					isOpen ? 'text-foreground' : ''
 				)}
 				style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
-				aria-haspopup={isRouter ? 'listbox' : undefined}
-				aria-expanded={isRouter ? isOpen : undefined}
-				onclick={toggleOpen}
-				bind:this={triggerButton}
-				disabled={disabled || updating}
+				onclick={handleTriggerClick}
+				disabled={disabled || updating || !isRouter}
 			>
 				<Package class="h-3.5 w-3.5" />

@@ -451,33 +388,35 @@
 				{:else if isRouter}
 					<ChevronDown class="h-3 w-3.5" />
 				{/if}
-			</button>
+			</Popover.Trigger>

-			{#if isOpen && isRouter}
-				<div
-					bind:this={menuRef}
-					use:portalToBody
-					class={cn(
-						'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
-						menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
-					)}
-					role="listbox"
-					style:top={menuPosition ? `${menuPosition.top}px` : undefined}
-					style:left={menuPosition ? `${menuPosition.left}px` : undefined}
-					style:width={menuPosition ? `${menuPosition.width}px` : undefined}
-					data-placement={menuPosition?.placement ?? 'bottom'}
-				>
+			<Popover.Content
+				class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
+				align="end"
+				sideOffset={8}
+				collisionPadding={16}
+			>
+				<div class="flex max-h-[50dvh] flex-col overflow-hidden">
 					<div
-						class="overflow-y-auto py-1"
-						style:max-height={menuPosition && menuPosition.maxHeight > 0
-							? `${menuPosition.maxHeight}px`
-							: undefined}
+						class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
+					>
+						<SearchInput
+							id="model-search"
+							placeholder="Search models..."
+							bind:value={searchTerm}
+							bind:ref={searchInputRef}
+							onClose={closeMenu}
+							onKeyDown={handleSearchKeyDown}
+						/>
+					</div>
+					<div
+						class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
 					>
 						{#if !isCurrentModelInCache() && currentModel}
 							<!-- Show unavailable model as first option (disabled) -->
 							<button
 								type="button"
-								class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400"
+								class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
 								role="option"
 								aria-selected="true"
 								aria-disabled="true"
@@ -488,20 +427,25 @@
 							</button>
 							<div class="my-1 h-px bg-border"></div>
 						{/if}
-						{#each options as option (option.id)}
+						{#if filteredOptions.length === 0}
+							<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
+						{/if}
+						{#each filteredOptions as option, index (option.id)}
 							{@const status = getModelStatus(option.model)}
 							{@const isLoaded = status === ServerModelStatus.LOADED}
 							{@const isLoading = status === ServerModelStatus.LOADING}
 							{@const isSelected = currentModel === option.model || activeId === option.id}
 							{@const isCompatible = isModelCompatible(option)}
+							{@const isHighlighted = index === highlightedIndex}
 							{@const missingModalities = getMissingModalities(option)}
+
 							<div
 								class={cn(
-									'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none',
+									'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
 									isCompatible
 										? 'cursor-pointer hover:bg-muted focus:bg-muted'
 										: 'cursor-not-allowed opacity-50',
-									isSelected
+									isSelected || isHighlighted
 										? 'bg-accent text-accent-foreground'
 										: isCompatible
 											? 'hover:bg-accent hover:text-accent-foreground'
@@ -509,10 +453,11 @@
 									isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
 								)}
 								role="option"
-								aria-selected={isSelected}
+								aria-selected={isSelected || isHighlighted}
 								aria-disabled={!isCompatible}
 								tabindex={isCompatible ? 0 : -1}
 								onclick={() => isCompatible && handleSelect(option.id)}
+								onmouseenter={() => (highlightedIndex = index)}
 								onkeydown={(e) => {
 									if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
 										e.preventDefault();
@@ -586,8 +531,8 @@
 						{/each}
 					</div>
 				</div>
-			{/if}
-		</div>
+			</Popover.Content>
+		</Popover.Root>
 	{/if}
 </div>

@@ -0,0 +1,19 @@
+import Root from './popover.svelte';
+import Close from './popover-close.svelte';
+import Content from './popover-content.svelte';
+import Trigger from './popover-trigger.svelte';
+import Portal from './popover-portal.svelte';
+
+export {
+	Root,
+	Content,
+	Trigger,
+	Close,
+	Portal,
+	//
+	Root as Popover,
+	Content as PopoverContent,
+	Trigger as PopoverTrigger,
+	Close as PopoverClose,
+	Portal as PopoverPortal
+};
@@ -0,0 +1,7 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
+</script>
+
+<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />
@@ -0,0 +1,37 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+	import PopoverPortal from './popover-portal.svelte';
+	import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
+	import type { ComponentProps } from 'svelte';
+
+	let {
+		ref = $bindable(null),
+		class: className,
+		sideOffset = 4,
+		side,
+		align = 'center',
+		collisionPadding = 8,
+		avoidCollisions = true,
+		portalProps,
+		...restProps
+	}: PopoverPrimitive.ContentProps & {
+		portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
+	} = $props();
+</script>
+
+<PopoverPortal {...portalProps}>
+	<PopoverPrimitive.Content
+		bind:ref
+		data-slot="popover-content"
+		{sideOffset}
+		{side}
+		{align}
+		{collisionPadding}
+		{avoidCollisions}
+		class={cn(
+			'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
+			className
+		)}
+		{...restProps}
+	/>
+</PopoverPortal>
@@ -0,0 +1,7 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let { ...restProps }: PopoverPrimitive.PortalProps = $props();
+</script>
+
+<PopoverPrimitive.Portal {...restProps} />
@@ -0,0 +1,17 @@
+<script lang="ts">
+	import { cn } from '$lib/components/ui/utils.js';
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let {
+		ref = $bindable(null),
+		class: className,
+		...restProps
+	}: PopoverPrimitive.TriggerProps = $props();
+</script>
+
+<PopoverPrimitive.Trigger
+	bind:ref
+	data-slot="popover-trigger"
+	class={cn('', className)}
+	{...restProps}
+/>
@@ -0,0 +1,7 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let { open = $bindable(false), ...restProps }: PopoverPrimitive.RootProps = $props();
+</script>
+
+<PopoverPrimitive.Root bind:open {...restProps} />
@@ -1,3 +1,2 @@
 export const VIEWPORT_GUTTER = 8;
 export const MENU_OFFSET = 6;
-export const MENU_MAX_WIDTH = 320;
@@ -295,14 +295,21 @@ class ModelsStore {
 	 * Fetch props for a specific model from /props endpoint
 	 * Uses caching to avoid redundant requests
 	 *
+	 * In ROUTER mode, this will only fetch props if the model is loaded,
+	 * since unloaded models return 400 from /props endpoint.
+	 *
 	 * @param modelId - Model identifier to fetch props for
-	 * @returns Props data or null if fetch failed
+	 * @returns Props data or null if fetch failed or model not loaded
 	 */
 	async fetchModelProps(modelId: string): Promise<ApiLlamaCppServerProps | null> {
 		// Return cached props if available
 		const cached = this.modelPropsCache.get(modelId);
 		if (cached) return cached;

+		if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) {
+			return null;
+		}
+
 		// Avoid duplicate fetches
 		if (this.modelPropsFetching.has(modelId)) return null;

@@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases}
 		expect(output).toBe(input); // Code blocks prevent misinterpretation
 	});

+	test('preserves backslash parentheses in code blocks (GitHub issue)', () => {
+		const input = '```python\nfoo = "\\(bar\\)"\n```';
+		const output = preprocessLaTeX(input);
+
+		expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
+	});
+
+	test('preserves backslash brackets in code blocks', () => {
+		const input = '```python\nfoo = "\\[bar\\]"\n```';
+		const output = preprocessLaTeX(input);
+
+		expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
+	});
+
+	test('preserves backslash parentheses in inline code', () => {
+		const input = 'Use `foo = "\\(bar\\)"` in your code.';
+		const output = preprocessLaTeX(input);
+
+		expect(output).toBe(input);
+	});
+
 	test('escape backslash in mchem ce', () => {
 		const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$';
 		const output = preprocessLaTeX(input);
@@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string {
 		return expr;
 	});

-	// Step 5: Restore code blocks
-	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
-		return codeBlocks[parseInt(index)];
-	});
-
-	// Step 6: Apply additional escaping functions (brackets and mhchem)
+	// Step 5: Apply additional escaping functions (brackets and mhchem)
+	// This must happen BEFORE restoring code blocks to avoid affecting code content
 	content = escapeBrackets(content);

 	if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
 		content = escapeMhchem(content);
 	}

-	// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
+	// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
+	// This must happen BEFORE restoring code blocks to avoid affecting code content
 	content = content
 		// Using the look‑behind pattern `(?<!\\)` we skip matches
 		// that are preceded by a backslash, e.g.
@@ -248,12 +245,18 @@ export function preprocessLaTeX(content: string): string {
 			// Using the look‑behind pattern `(?<!\\)` we skip matches
 			// that are preceded by a backslash, e.g. `\\[4pt]`.
 			/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
-			(_, prefix: string, content: string) => {
-				return `${prefix}$$${content}$$`;
+			(_, content: string) => {
+				return `$$${content}$$`;
 			}
 		);

-	// Step 7: Restore blockquote markers
+	// Step 7: Restore code blocks
+	// This happens AFTER all LaTeX conversions to preserve code content
+	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
+		return codeBlocks[parseInt(index)];
+	});
+
+	// Step 8: Restore blockquote markers
 	if (blockquoteMarkers.size > 0) {
 		const finalLines = content.split('\n');
 		const restoredLines = finalLines.map((line, index) => {
@@ -9,6 +9,10 @@ if (NOT MSVC)
 endif()

 target_link_libraries  (${TARGET} PRIVATE Threads::Threads)
+
+if (WIN32 AND NOT MSVC)
+    target_link_libraries(${TARGET} PUBLIC ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 target_compile_definitions(${TARGET} PRIVATE
Author	SHA1	Message	Date
Johannes Gäßler	482211438d	CUDA: fix overflow in MMA kernel without stream-k (#17939 )	2025-12-12 17:43:58 +01:00
Georgi Gerganov	7bed317f53	models : fix the attn_factor for mistral3 graphs + improve consistency (#17945 ) * models : fix the attn_factor for mistral3 graphs * cont : rework attn_factor correction logic * cont : make deepseek2 consistent * cont : add TODO * cont : special-case DSv2 * cont : revert Mistral 3 Large changes * cont : fix DS2 to use the original attn_factor * cont : minor comments	2025-12-12 17:12:40 +02:00
Sigbjørn Skjæret	dcb7d17758	cann : fix ops broken by circular padding guard (#17825 )	2025-12-12 15:49:27 +01:00
ixgbe	51604435e8	ggml-cpu : fix RISC-V Q4_0 repack select and RVV feature reporting (#17951 ) * ggml-cpu:fix RISC-V Q4_0 repack select and RVV feature reporting Signed-off-by: Wang Yang <yangwang@iscas.ac.cn> * using the name VLEN instead of CNT * Update ggml/include/ggml-cpu.h --------- Signed-off-by: Wang Yang <yangwang@iscas.ac.cn> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2025-12-12 16:26:03 +02:00
Xuan-Son Nguyen	17158965ac	mtmd: explicitly forbidden inclusion of private header and libcommon (#17946 )	2025-12-12 15:16:06 +01:00
Aleksander Grygier	12280ae905	webui: Fix parsing non-LaTeX occurrencies of `\(` or `\)` (#17810 ) * fix: Improve latex protection logic to prevent turning non-latex `\(` into `$` * chore: update webui build output	2025-12-12 15:13:36 +01:00
Xuan-Son Nguyen	54a0fee4b7	arg: add -mm and -mmu as short form of --mmproj and --mmproj-url (#17958 ) * arg: add -mm and -mmu as short form of --mmproj and --mmproj-url * correct order * update docs	2025-12-12 14:06:06 +01:00
Daniel Bevenius	dada4c846d	model-conversion : remove max diff check in compare-logits [no ci] (#17954 ) This commit removes the maximum difference check from the compare-logits.py which would stop early if the difference between the logits exceeded a threshold. The motivation for removing this is that it can be useful to be able to get the complete log for debugging/reporting purposes.	2025-12-12 13:25:16 +01:00
Adrien Gallouët	b8ee22cfde	common : add minimalist multi-thread progress bar (#17602 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2025-12-12 12:44:35 +01:00
Gustavo Rocha Dias	2eaa2c65cb	cmake: link ws2_32 for MinGW/w64devkit builds in cpp-httplib (#17949 )	2025-12-12 12:02:28 +01:00
yulo	c33a58bced	HIP: enable mmf for RDNA3 (#17879 ) * enable mmf for RDNA3 * disable mmf for some shape * move some mmvf to mmf * more mmfv to mmf * 3 is good in mmvf --------- Co-authored-by: zhang hui <you@example.com>	2025-12-12 11:34:33 +01:00
Pascal	a81a569577	Add a search field on model selector / improve mobile display (#17765 ) * webui: add search field to model selector and fixes mobile viewport overflow * webui: simplify model search style and code * refacor: Search Input component & consistent UI for Models Selector search * feat: Use Popover component + improve interactions * fix: Fetching props for only loaded models in ROUTER mode * webui: prevent models selector popover from overflowing viewport Use Floating UI's auto-positioning with 50dvh height limit and proper collision detection instead of forcing top positioning. Fixes overflow on desktop and mobile keyboard issues * webui: keep search field near trigger in models selector Place search at the 'near end' (closest to trigger) by swapping layout with CSS flexbox order based on popover direction. Prevents input from moving during typing as list shrinks * chore: update webui build output --------- Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>	2025-12-11 18:21:21 +01:00
Piotr Wilkin (ilintar)	53ecd4fdb9	SOLVE_TRI extension to more dimensions (#17793 ) * Extended TRI * Fix whitespace * chore: update webui build output * Just use cuBLAS for everything... * Merge both versions * Remove incorrect imports causing failures for CI * Still failing... remove all direct cublas imports and rely on common imports from "common.cuh" * Defines for hipBlas * Aaaand MUSA defines... * I hate this job... * Stupid typo... * Update ggml/src/ggml-cuda/solve_tri.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>	2025-12-11 17:20:43 +01:00
Georgi Gerganov	c6f6e4f96a	ggml-alloc : fix reuse-parent logic for misaligned sizes (#17884 )	2025-12-11 14:30:10 +02:00
Georgi Gerganov	d9f8f60618	batch : fix sequence id ownership (#17915 ) * batch : fix sequence id ownage * cont : reduce allocations	2025-12-11 14:29:47 +02:00
Yuichiro Utsumi	e4ae383317	docs: use port 8080 in Docker examples (#17903 )	2025-12-11 17:12:07 +08:00
nullname	34ce48d97a	ggml-hexagon: fix `rope` failure at `test-backend-ops` (#17565 ) * fix test failure * fix: correct scaling calculations in rope_cache_init * fix: optimize element copying in rope_hex_f32 using memcpy * fix: optimize loop boundaries in rope_hex_f32 for better performance * feat: add profiling macros for performance measurement in operations	2025-12-10 14:45:43 -08:00
Sigbjørn Skjæret	45e350e3d3	ci: fix riscv64-native build (#17916 )	2025-12-10 23:24:31 +01:00
Xuan-Son Nguyen	c6b2c9310c	mtmd: some small clean up (#17909 ) * clip: add support for fused qkv in build_vit * use bulid_ffn whenever possible * fix internvl * mtmd-cli: move image to beginning * test script: support custom args	2025-12-10 22:20:06 +01:00