common : cleanup logs and modernize the progress bar (#21215 )

```
$ build/bin/llama-server -hf unsloth/Qwen3.5-0.8B-GGUF
common_download_file_single_online: HEAD failed, status: 404
no remote preset found, skipping
Downloading mmproj-BF16.gguf ——————————————————————————————————————— 100%
Downloading Qwen3.5-0.8B-Q4_K_M.gguf ——————————————————————————————— 100%
...
```

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
CANN: fix multi-thread set_tensor race conditions (#20151 )
2026-07-01 10:07:44 +02:00 · 2026-03-31 16:18:00 +02:00 · 2026-03-31 17:00:51 +03:00 · 2026-03-31 15:44:26 +02:00 · 2026-03-31 13:52:42 +02:00 · 2026-03-31 13:50:51 +02:00
33 changed files with 831 additions and 272 deletions
@@ -21,14 +21,6 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

-[tools/server/public/*]
-indent_size = 2
-
-[tools/server/public/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
 [tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
@@ -61,6 +53,14 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

+[tools/server/public/**]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
+
 [benches/**]
 indent_style = unset
 indent_size = unset
@@ -0,0 +1,4 @@
+# Treat the generated single-file WebUI build as binary for diff purposes.
+# Git's pack-file delta compression still works (byte-level), but this prevents
+# git diff from printing the entire minified file on every change.
+tools/server/public/index.html -diff
@@ -95,6 +95,8 @@
 # Server Web UI temporary files
 /tools/server/webui/node_modules
 /tools/server/webui/dist
+# we no longer use gz for index.html
+/tools/server/public/index.html.gz

 # Python

@@ -221,7 +221,7 @@ using chat_template_caps = jinja::caps;
 struct common_chat_templates {
    bool add_bos;
    bool add_eos;
-    bool has_explicit_template;  // Model had builtin template or template overridde was specified.
+    bool has_explicit_template;  // Model had a builtin template or a template override was specified.
    std::unique_ptr<common_chat_template> template_default;  // always set (defaults to chatml)
    std::unique_ptr<common_chat_template> template_tool_use;
 };
@@ -989,6 +989,10 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
        auto analysis = p.ref("analysis");
        auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
        auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
+
+        // Consume any unsolicited tool calls, e.g. builtin functions
+        auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
+
        auto any = p.rule("any", preamble | analysis);

        if (has_response_format) {
@@ -1032,7 +1036,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
            return p.zero_or_more(start + any) + start + (tool_call | final_msg);
        }

-        return p.zero_or_more(start + any) + start + final_msg;
+        return p.zero_or_more(start + any) + start + (final_msg | unsolicited);
    });

    data.parser = parser.save();
@@ -119,6 +119,9 @@ class ProgressBar {
    static inline std::map<const ProgressBar *, int> lines;
    static inline int max_line = 0;

+    std::string filename;
+    size_t len = 0;
+
    static void cleanup(const ProgressBar * line) {
        lines.erase(line);
        if (lines.empty()) {
@@ -135,7 +138,23 @@ class ProgressBar {
    }

 public:
-    ProgressBar() = default;
+    // Derive a short display name from a URL or request path: drop the query
+    // string, keep only the last path component, and cap the result at 40
+    // code points (39 from the name plus an ellipsis). `len` ends up holding
+    // the number of code points kept; update() subtracts it from the bar width.
+    ProgressBar(const std::string & url = "") : filename(url) {
+        // Strip the query first: a '/' inside it (e.g. "?ref=a/b") must not be
+        // mistaken for a path separator when taking the basename below.
+        if (auto pos = filename.find('?'); pos != std::string::npos) {
+            filename.resize(pos);
+        }
+        if (auto pos = filename.rfind('/'); pos != std::string::npos) {
+            filename.erase(0, pos + 1);
+        }
+        for (size_t i = 0; i < filename.size(); ++i) {
+            // Count UTF-8 lead bytes only; continuation bytes look like 10xxxxxx.
+            if ((filename[i] & 0xC0) != 0x80) {
+                if (len++ == 39) {
+                    // i is the start of a code point, so no sequence is split.
+                    filename.resize(i);
+                    filename += "…";
+                    break;
+                }
+            }
+        }
+    }

    ~ProgressBar() {
        std::lock_guard<std::mutex> lock(mutex);
@@ -143,11 +162,7 @@ public:
    }

    void update(size_t current, size_t total) {
-        if (!is_output_a_tty()) {
-            return;
-        }
-
-        if (!total) {
+        if (!total || !is_output_a_tty()) {
            return;
        }

@@ -159,28 +174,27 @@ public:
        }
        int lines_up = max_line - lines[this];

-        size_t width = 50;
+        size_t bar = 55 - len;
        size_t pct = (100 * current) / total;
-        size_t pos = (width * current) / total;
-
-        std::cout << "\033[s";
+        size_t pos = (bar * current) / total;

        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A";
        }
-        std::cout << "\033[2K\r["
-            << std::string(pos, '=')
-            << (pos < width ? ">" : "")
-            << std::string(width - pos, ' ')
-            << "] " << std::setw(3) << pct << "%  ("
-            << current / (1024 * 1024) << " MB / "
-            << total / (1024 * 1024) << " MB) "
-            << "\033[u";
+        std::cout << '\r' << "Downloading " << filename << " ";

-        std::cout.flush();
+        for (size_t i = 0; i < bar; ++i) {
+            std::cout << (i < pos ? "—" : " ");
+        }
+        std::cout << std::setw(4) << pct << "%\033[K";
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "B";
+        }
+        std::cout << '\r' << std::flush;

        if (current == total) {
-             cleanup(this);
+            cleanup(this);
        }
    }

@@ -208,7 +222,7 @@ static bool common_pull_file(httplib::Client & cli,
    const char * func = __func__; // avoid __func__ inside a lambda
    size_t downloaded = existing_size;
    size_t progress_step = 0;
-    ProgressBar bar;
+    ProgressBar bar(resolve_path);

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
@@ -286,7 +300,7 @@ static int common_download_file_single_online(const std::string        & url,
    const bool file_exists = std::filesystem::exists(path);

    if (file_exists && skip_etag) {
-        LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
        return 304; // 304 Not Modified - fake cached response
    }

@@ -294,7 +308,7 @@ static int common_download_file_single_online(const std::string        & url,
    if (file_exists) {
        last_etag = read_etag(path);
    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+        LOG_DBG("%s: no previous model file found %s\n", __func__, path.c_str());
    }

    auto head = cli.Head(parts.path);
@@ -328,11 +342,11 @@ static int common_download_file_single_online(const std::string        & url,

    if (file_exists) {
        if (etag.empty()) {
-            LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
+            LOG_DBG("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        if (!last_etag.empty() && last_etag == etag) {
-            LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
+            LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        if (remove(path.c_str()) != 0) {
@@ -368,7 +382,7 @@ static int common_download_file_single_online(const std::string        & url,
            }
        }

-        LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
+        LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
                __func__, common_http_show_masked_url(parts).c_str(),
                path_temporary.c_str(), etag.c_str());

@@ -437,7 +451,7 @@ int common_download_file_single(const std::string & url,
        return -1;
    }

-    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+    LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
    return 304; // Not Modified - fake cached response
 }

@@ -51,7 +51,7 @@ struct common_ngram_map_value {
 // statistics of a n-gram
 struct common_ngram_map_key {
    size_t   key_idx;   // index of key n-gram in token-history
-    size_t   stat_idx;  // index of last token of stastistics computation (key_num, values)
+    size_t   stat_idx;  // index of last token of statistics computation (key_num, values)

    uint16_t key_num;   // number of occurrences of this key n-gram in token-history
    common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
@@ -434,6 +434,9 @@ void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

@@ -456,6 +459,13 @@ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    float          p_value  = 2.0f;
    acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
    GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
+
+    // Clamp norm to at least eps: scale = 1/fmaxf(norm, eps)
+    acl_scalar_ptr acl_min = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
+    float          flt_max = FLT_MAX;
+    acl_scalar_ptr acl_max = ggml_cann_create_scalar(&flt_max, aclDataType::ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_div.get(), acl_min.get(), acl_max.get(), acl_div.get());
+
    GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
 }

@@ -216,14 +216,16 @@ struct ggml_cann_pool_alloc {
 #ifdef USE_ACL_GRAPH
 struct ggml_graph_node_properties {
    // dst tensor
-    void *  node_address;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t  nb[GGML_MAX_DIMS];
+    void *    node_address;
+    ggml_type node_type;
+    int64_t   ne[GGML_MAX_DIMS];
+    size_t    nb[GGML_MAX_DIMS];

    // src tensor
-    void *  src_address[GGML_MAX_SRC];
-    int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
-    size_t  src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
+    void *    src_address[GGML_MAX_SRC];
+    ggml_type src_type[GGML_MAX_SRC];
+    int64_t   src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
+    size_t    src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];

    // op
    ggml_op node_op;
@@ -247,6 +249,10 @@ struct ggml_graph_node_properties {
            return false;
        }

+        if (node->type != this->node_type) {
+            return false;
+        }
+
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            if (node->ne[i] != this->ne[i]) {
                return false;
@@ -262,6 +268,10 @@ struct ggml_graph_node_properties {
                    return false;
                }

+                if (node->src[i]->type != this->src_type[i]) {
+                    return false;
+                }
+
                for (int d = 0; d < GGML_MAX_DIMS; d++) {
                    if (node->src[i]->ne[d] != this->src_ne[i][d]) {
                        return false;
@@ -277,10 +287,7 @@ struct ggml_graph_node_properties {
            }
        }

-        if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU || node->op == GGML_OP_ROPE){
-            return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
-        }
-        return true;
+        return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
    }
 };

@@ -322,6 +329,7 @@ struct ggml_cann_graph {

            prop.node_address = node->data;
            prop.node_op      = node->op;
+            prop.node_type    = node->type;

            std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
            std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
@@ -329,10 +337,12 @@ struct ggml_cann_graph {
            for (int src = 0; src < GGML_MAX_SRC; ++src) {
                if (node->src[src]) {
                    prop.src_address[src] = node->src[src]->data;
+                    prop.src_type[src]    = node->src[src]->type;
                    std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
                    std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
                } else {
                    prop.src_address[src] = nullptr;
+                    prop.src_type[src]    = GGML_TYPE_COUNT;
                    std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
                    std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
                }
@@ -36,10 +36,13 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <memory>
 #include <mutex>
 #include <optional>
 #include <queue>
+#include <unordered_map>
 #include <unordered_set>
+#include <vector>

 #define GGML_COMMON_DECL_C

@@ -770,6 +773,21 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(i
 }

 // cann buffer
+
+/**
+ * @brief Per-tensor bookkeeping for set_tensor calls that arrive in chunks.
+ *
+ * Several threads may each call set_tensor on a different slice of the same
+ * tensor. The tracker sums the bytes seen so far so that post-processing
+ * (quantized format transform or ND-to-NZ conversion) can be deferred until
+ * the whole tensor has been written.
+ */
+struct TensorSetTracker {
+    std::mutex           mtx;               ///< Guards concurrent access to this tracker
+    size_t               bytes_written{0};  ///< Running total of bytes written so far
+    size_t               total_bytes{0};    ///< Size of the full tensor (completion target)
+    std::vector<uint8_t> host_buffer;       ///< Host-side staging area for quantized tensors
+};
+
 /**
 * @brief Context for managing a CANN buffer associated with a specific device.
 *
@@ -780,6 +798,9 @@ struct ggml_backend_cann_buffer_context {
    int32_t device;             ///< The device ID associated with this buffer context.
    void *  dev_ptr = nullptr;  ///< Pointer to the device memory allocated for the buffer.

+    std::mutex tracker_mutex;   ///< Protects the trackers map
+    std::unordered_map<void *, std::unique_ptr<TensorSetTracker>> trackers;
+
    /**
     * @brief Constructor to initialize the CANN buffer context.
     *
@@ -792,6 +813,31 @@ struct ggml_backend_cann_buffer_context {
     * @brief Destructor to free the device memory allocated for the buffer.
     */
    ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
+
+    /**
+     * @brief Look up the write tracker for a tensor, creating it on first use.
+     *
+     * Trackers are keyed by tensor->data. A newly created tracker has its
+     * total_bytes set to ggml_nbytes(tensor). The returned pointer stays owned
+     * by the trackers map.
+     */
+    TensorSetTracker * get_or_create_tracker(ggml_tensor * tensor) {
+        std::lock_guard<std::mutex> guard(tracker_mutex);
+        const auto existing = trackers.find(tensor->data);
+        if (existing != trackers.end()) {
+            return existing->second.get();
+        }
+        auto fresh = std::make_unique<TensorSetTracker>();
+        fresh->total_bytes = ggml_nbytes(tensor);
+        return trackers.emplace(tensor->data, std::move(fresh)).first->second.get();
+    }
+
+    /**
+     * @brief Drop the tracker registered under tensor->data, if there is one.
+     */
+    void remove_tracker(ggml_tensor * tensor) {
+        const std::lock_guard<std::mutex> guard(tracker_mutex);
+        void * const key = tensor->data;
+        trackers.erase(key);
+    }
 };

 // cann buffer type
@@ -1124,6 +1170,7 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer
 * designed to be used with a global array, one per device.
 */
 struct ggml_cann_nz_workspace {
+    std::mutex mtx;    // Protects ptr/allocated from concurrent access
    void * ptr;        // Pointer to allocated device buffer
    size_t allocated;  // Size of currently allocated buffer in bytes

@@ -1190,13 +1237,15 @@ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
 * @note The workspace buffer used in this function is managed globally and reused
 *       across calls. This reduces overhead from repeated memory allocation and deallocation.
 */
-static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
-    acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
+static void weight_format_to_nz(ggml_tensor * tensor, int device) {
+    acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, 0);
    uint64_t       workspaceSize    = 0;
    aclOpExecutor * executor;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
+
+    std::lock_guard<std::mutex> lock(g_nz_workspaces[device].mtx);
    // Avoid frequent malloc/free of the workspace.
    g_nz_workspaces[device].realloc(workspaceSize);

@@ -1210,7 +1259,13 @@ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device)
 * @brief Set tensor data in a CANN buffer.
 *
 * This function sets tensor data in a CANN buffer, handling transformations
- * if needed based on the tensor's type.
+ * if needed based on the tensor's type. It supports multi-threaded calls
+ * where different threads write different chunks of the same tensor.
+ *
+ * For quantized tensors (Q4_0/Q8_0), data is staged in a host buffer and
+ * the format transform is deferred until all chunks are written.
+ * For NZ weight tensors, chunks are uploaded directly but the ND-to-NZ
+ * conversion is deferred until all chunks are written.
 *
 * @param buffer The CANN buffer where the tensor data will be set.
 * @param tensor Pointer to the tensor whose data will be set.
@@ -1226,26 +1281,72 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
    ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;

    ggml_cann_set_device(ctx->device);
-    // TODO: refer to cann(#6017), it use thread's default stream.
-    // For acl, synchronous functions use this default stream.
-    // Why aclrtSynchronizeDevice?

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
-    if (!need_transform(tensor->type)) {
+
+    bool is_quantized = need_transform(tensor->type);
+    bool is_nz        = !is_quantized && tensor->type != GGML_TYPE_BF16 && weight_to_nz &&
+                 is_matmul_weight((const ggml_tensor *) tensor);
+
+    // Plain tensor (not quantized, not NZ): direct copy, no tracking needed
+    if (!is_quantized && !is_nz) {
        ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weight_to_nz && tensor->type != GGML_TYPE_BF16
-            && is_matmul_weight((const ggml_tensor *) tensor)) {
+        return;
+    }
+
+    // Single-shot write (full tensor at once): handle directly without tracking overhead
+    if (offset == 0 && size == ggml_nbytes(tensor)) {
+        if (is_quantized) {
+            void * transform_buffer = malloc(size);
+            ggml_backend_cann_transform(tensor, data, transform_buffer);
+            ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
+            free(transform_buffer);
+        } else {
+            // NZ weight
            GGML_ASSERT(tensor->ne[2] == 1);
            GGML_ASSERT(tensor->ne[3] == 1);
-            weight_format_to_nz(tensor, offset, ctx->device);
+            ACL_CHECK(aclrtMemcpy(tensor->data, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+            weight_format_to_nz(tensor, ctx->device);
        }
-    } else {
-        void * transform_buffer = malloc(size);
-        ggml_backend_cann_transform(tensor, data, transform_buffer);
+        return;
+    }

-        ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        free(transform_buffer);
+    // Chunked write: use tracker to accumulate progress and defer transform/conversion
+    TensorSetTracker * tracker = ctx->get_or_create_tracker(tensor);
+    std::unique_lock<std::mutex> lock(tracker->mtx);
+
+    if (is_quantized) {
+        // Stage data in host buffer; transform requires full tensor data
+        if (tracker->host_buffer.empty()) {
+            tracker->host_buffer.resize(tracker->total_bytes);
+        }
+        memcpy(tracker->host_buffer.data() + offset, data, size);
+    } else {
+        // NZ weight: upload chunk to device immediately, defer conversion
+        ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+    }
+
+    tracker->bytes_written += size;
+
+    // All chunks received: perform deferred transform/conversion
+    if (tracker->bytes_written >= tracker->total_bytes) {
+        if (is_quantized) {
+            void * transform_buffer = malloc(tracker->total_bytes);
+            ggml_backend_cann_transform(tensor, tracker->host_buffer.data(), transform_buffer);
+            ACL_CHECK(aclrtMemcpy(tensor->data, tracker->total_bytes, transform_buffer, tracker->total_bytes, ACL_MEMCPY_HOST_TO_DEVICE));
+            free(transform_buffer);
+        }
+
+        if (is_nz) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
+            weight_format_to_nz(tensor, ctx->device);
+        }
+
+        // Unlock before removing tracker, as remove_tracker destroys the mutex
+        lock.unlock();
+        ctx->remove_tracker(tensor);
    }
 }

@@ -294,7 +294,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
    }

    // get extra buffer types of the CPU
-    // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
@@ -18,7 +18,7 @@ struct llama_ubatch {
    }

    // typical for M-RoPE cases:
-    //   0 - sequantial position of the tokens/embeddings in the sequence
+    //   0 - sequential position of the tokens/embeddings in the sequence
    //   1 - y position in the image
    //   2 - x position in the image
    //   3 - other
@@ -586,7 +586,7 @@ void llama_context::sched_reserve() {

    // reserve again with pp graph to avoid ggml-alloc reallocations during inference
    {
-        // TODO: not sure if the following graph would be worster case for multi-stream KV caches:
+        // TODO: not sure if the following graph would be worst case for multi-stream KV caches:
        //
        // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
        //
@@ -1665,7 +1665,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {

 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
    // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
-    //       but this would make the graph topology depend on the number of output tokens, which can interere with
+    //       but this would make the graph topology depend on the number of output tokens, which can interfere with
    //       features that require constant topology such as pipeline parallelism
    //       ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
    //if (n_outputs < n_tokens) {
@@ -333,7 +333,7 @@ public:
    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

    // store k_cur and v_cur in the cache based on the provided head location
-    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    // note: the heads in k_cur and v_cur should be laid out contiguously in memory
    //   - k_cur  [n_embd_head_k, n_head_k, n_tokens]
    //   - k_idxs [n_tokens]
    //   - v_cur  [n_embd_head_v, n_head_v, n_tokens]
@@ -9,7 +9,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,

    inpL = build_inp_embd(model.tok_embd);

-    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
    cb(inpL, "inp_scaled", -1);

@@ -9,7 +9,7 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr

    inpL = build_inp_embd(model.tok_embd);

-    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
    cb(inpL, "inp_scaled", -1);

@@ -12,7 +12,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const

    inpL = build_inp_embd(model.tok_embd);

-    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
    cb(inpL, "inp_scaled", -1);

@@ -3077,6 +3077,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .expect_reasoning("I need to output the invoice details in JSON")
            .expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
            .run();
+
+
+        // Unsolicited tool calls. There is no good way to handle these, so we return empty content.
+
+        // Builtin function - recipient in role
+        tst.test(
+               "<|channel|>analysis<|message|>I will execute python to say hello<|end|>"
+               "<|start|>assistant to=container.exec<|channel|>commentary<|message|>python3 -c 'print(\"hello\")'")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .expect_reasoning("I will execute python to say hello")
+            .expect_content("")
+            .run();
+
+        // Builtin function - recipient in channel
+        tst.test(
+               "<|channel|>analysis<|message|>I will execute python to say hello<|end|>"
+               "<|start|>assistant<|channel|>commentary to=python <|constrain|>code<|message|>print(\"hello\")")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .expect_reasoning("I will execute python to say hello")
+            .expect_content("")
+            .run();
    }

    {
@@ -42,7 +42,9 @@ option(LLAMA_BUILD_WEBUI "Build the embedded Web UI" ON)

 if (LLAMA_BUILD_WEBUI)
    set(PUBLIC_ASSETS
-        index.html.gz
+        index.html
+        bundle.js
+        bundle.css
        loading.html
    )

@@ -259,6 +259,6 @@ npm run test
 npm run build
 ```

-After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
+After `public/index.html` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.

 **Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development.
@@ -10,7 +10,9 @@

 #ifdef LLAMA_BUILD_WEBUI
 // auto generated files (see README.md for details)
-#include "index.html.gz.hpp"
+#include "index.html.hpp"
+#include "bundle.js.hpp"
+#include "bundle.css.hpp"
 #include "loading.html.hpp"
 #endif

@@ -272,16 +274,19 @@ bool server_http_context::init(const common_params & params) {
        } else {
 #ifdef LLAMA_BUILD_WEBUI
            // using embedded static index.html
-            srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
-                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
-                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
-                } else {
-                    res.set_header("Content-Encoding", "gzip");
-                    // COEP and COOP headers, required by pyodide (python interpreter)
-                    res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
-                    res.set_header("Cross-Origin-Opener-Policy", "same-origin");
-                    res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
-                }
+            srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) {
+                // COEP and COOP headers, required by pyodide (python interpreter)
+                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+                res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
+                return false;
+            });
+            srv->Get(params.api_prefix + "/bundle.js", [](const httplib::Request & /*req*/, httplib::Response & res) {
+                res.set_content(reinterpret_cast<const char*>(bundle_js), bundle_js_len, "application/javascript; charset=utf-8");
+                return false;
+            });
+            srv->Get(params.api_prefix + "/bundle.css", [](const httplib::Request & /*req*/, httplib::Response & res) {
+                res.set_content(reinterpret_cast<const char*>(bundle_css), bundle_css_len, "text/css; charset=utf-8");
                return false;
            });
 #endif
@@ -188,14 +188,14 @@ The build process:
 1. **Vite Build** - Bundles all TypeScript, Svelte, and CSS
 2. **Static Adapter** - Outputs to `../public` (llama-server's static file directory)
 3. **Post-Build Script** - Cleans up intermediate files
-4. **Custom Plugin** - Creates `index.html.gz` with:
+4. **Custom Plugin** - Creates `index.html` with:
   - Inlined favicon as base64
   - GZIP compression (level 9)
   - Deterministic output (zeroed timestamps)

 ```text
 tools/server/webui/        →  build  →  tools/server/public/
-├── src/                                 ├── index.html.gz  (served by llama-server)
+├── src/                                 ├── index.html  (served by llama-server)
 ├── static/                              └── (favicon inlined)
 └── ...
 ```
@@ -219,7 +219,7 @@ output: {

 The WebUI is embedded directly into the llama-server binary:

-1. `npm run build` outputs `index.html.gz` to `tools/server/public/`
+1. `npm run build` outputs `index.html` to `tools/server/public/`
 2. llama-server compiles this into the binary at build time
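 3. When accessing `/`, llama-server serves the embedded HTML (it is no longer pre-gzipped)
 4. Fonts and favicon are inlined; JS and CSS are embedded as separate `bundle.js` and `bundle.css` assets and served at `/bundle.js` and `/bundle.css`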
@@ -50,7 +50,6 @@
 		"eslint-config-prettier": "^10.0.1",
 		"eslint-plugin-storybook": "^10.2.4",
 		"eslint-plugin-svelte": "^3.0.0",
-		"fflate": "^0.8.2",
 		"globals": "^16.0.0",
 		"http-server": "^14.1.1",
 		"mdast": "^3.0.0",
@@ -1,14 +1,12 @@
 #!/bin/bash

-# Script to install pre-commit and pre-push hooks for webui
-# Pre-commit: formats code and runs checks
-# Pre-push: builds the project, stashes unstaged changes
+# Script to install pre-commit hook for webui
+# Pre-commit: formats, checks, builds, and stages build output

 REPO_ROOT=$(git rev-parse --show-toplevel)
 PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit"
-PRE_PUSH_HOOK="$REPO_ROOT/.git/hooks/pre-push"

-echo "Installing pre-commit and pre-push hooks for webui..."
+echo "Installing pre-commit hook for webui..."

 # Create the pre-commit hook
 cat > "$PRE_COMMIT_HOOK" << 'EOF'
@@ -16,21 +14,19 @@ cat > "$PRE_COMMIT_HOOK" << 'EOF'

 # Check if there are any changes in the webui directory
 if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
-    echo "Formatting and checking webui code..."
-    
-    # Change to webui directory and run format
-    cd tools/server/webui
-    
-    # Check if npm is available and package.json exists
+    REPO_ROOT=$(git rev-parse --show-toplevel)
+    cd "$REPO_ROOT/tools/server/webui"
+
+    # Check if package.json exists
    if [ ! -f "package.json" ]; then
        echo "Error: package.json not found in tools/server/webui"
        exit 1
    fi
-    
+
+    echo "Formatting and checking webui code..."
+
    # Run the format command
    npm run format
-
-    # Check if format command succeeded
    if [ $? -ne 0 ]; then
        echo "Error: npm run format failed"
        exit 1
@@ -38,8 +34,6 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then

    # Run the lint command
    npm run lint
-    
-    # Check if lint command succeeded
    if [ $? -ne 0 ]; then
        echo "Error: npm run lint failed"
        exit 1
@@ -47,156 +41,42 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then

    # Run the check command
    npm run check
-    
-    # Check if check command succeeded
    if [ $? -ne 0 ]; then
        echo "Error: npm run check failed"
        exit 1
    fi

-    # Go back to repo root
-    cd ../../..
-    
    echo "✅ Webui code formatted and checked successfully"
-fi

-exit 0
-EOF
-
-# Create the pre-push hook
-cat > "$PRE_PUSH_HOOK" << 'EOF'
-#!/bin/bash
-
-# Check if there are any webui changes that need building
-WEBUI_CHANGES=$(git diff --name-only @{push}..HEAD | grep "^tools/server/webui/" || true)
-
-if [ -n "$WEBUI_CHANGES" ]; then
-    echo "Webui changes detected, checking if build is up-to-date..."
-    
-    # Change to webui directory
-    cd tools/server/webui
-    
-    # Check if npm is available and package.json exists
-    if [ ! -f "package.json" ]; then
-        echo "Error: package.json not found in tools/server/webui"
+    # Build the webui
+    echo "Building webui..."
+    npm run build
+    if [ $? -ne 0 ]; then
+        echo "❌ npm run build failed"
        exit 1
    fi
-    
-    # Check if build output exists and is newer than source files
-    BUILD_FILE="../public/index.html.gz"
-    NEEDS_BUILD=false
-    
-    if [ ! -f "$BUILD_FILE" ]; then
-        echo "Build output not found, building..."
-        NEEDS_BUILD=true
-    else
-        # Check if any source files are newer than the build output
-        if find src -newer "$BUILD_FILE" -type f | head -1 | grep -q .; then
-            echo "Source files are newer than build output, rebuilding..."
-            NEEDS_BUILD=true
-        fi
-    fi
-    
-    if [ "$NEEDS_BUILD" = true ]; then
-        echo "Building webui..."
-        
-        # Stash any unstaged changes to avoid conflicts during build
-        echo "Checking for unstaged changes..."
-        if ! git diff --quiet || ! git diff --cached --quiet --diff-filter=A; then
-            echo "Stashing unstaged changes..."
-            git stash push --include-untracked -m "Pre-push hook: stashed unstaged changes"
-            STASH_CREATED=$?
-        else
-            echo "No unstaged changes to stash"
-            STASH_CREATED=1
-        fi
-        
-        # Run the build command
-        npm run build
-        
-        # Check if build command succeeded
-        if [ $? -ne 0 ]; then
-            echo "Error: npm run build failed"
-            if [ $STASH_CREATED -eq 0 ]; then
-                echo "You can restore your unstaged changes with: git stash pop"
-            fi
-            exit 1
-        fi

-        # Go back to repo root
-        cd ../../..
-        
-        # Check if build output was created/updated
-        if [ -f "tools/server/public/index.html.gz" ]; then
-            # Add the build output and commit it
-            git add tools/server/public/index.html.gz
-            if ! git diff --cached --quiet; then
-                echo "Committing updated build output..."
-                git commit -m "chore: update webui build output"
-                echo "✅ Build output committed successfully"
-            else
-                echo "Build output unchanged"
-            fi
-        else
-            echo "Error: Build output not found after build"
-            if [ $STASH_CREATED -eq 0 ]; then
-                echo "You can restore your unstaged changes with: git stash pop"
-            fi
-            exit 1
-        fi
-        
-        if [ $STASH_CREATED -eq 0 ]; then
-            echo "✅ Build completed. Your unstaged changes have been stashed."
-            echo "They will be automatically restored after the push."
-            # Create a marker file to indicate stash was created by pre-push hook
-            touch .git/WEBUI_PUSH_STASH_MARKER
-        fi
-    else
-        echo "✅ Build output is up-to-date"
-    fi
-    
-    echo "✅ Webui ready for push"
+    # Stage the build output alongside the source changes
+    cd "$REPO_ROOT"
+    git add tools/server/public/
+
+    echo "✅ Webui built and build output staged"
 fi

 exit 0
 EOF

-# Create the post-push hook (for restoring stashed changes after push)
-cat > "$REPO_ROOT/.git/hooks/post-push" << 'EOF'
-#!/bin/bash
-
-# Check if we have a stash marker from the pre-push hook
-if [ -f .git/WEBUI_PUSH_STASH_MARKER ]; then
-    echo "Restoring your unstaged changes after push..."
-    git stash pop
-    rm -f .git/WEBUI_PUSH_STASH_MARKER
-    echo "✅ Your unstaged changes have been restored."
-fi
-
-exit 0
-EOF
-
-# Make all hooks executable
+# Make hook executable
 chmod +x "$PRE_COMMIT_HOOK"
-chmod +x "$PRE_PUSH_HOOK"
-chmod +x "$REPO_ROOT/.git/hooks/post-push"

 if [ $? -eq 0 ]; then
-    echo "✅ Git hooks installed successfully!"
+    echo "✅ Git hook installed successfully!"
    echo "   Pre-commit: $PRE_COMMIT_HOOK"
-    echo "   Pre-push:   $PRE_PUSH_HOOK"
-    echo "   Post-push:  $REPO_ROOT/.git/hooks/post-push"
    echo ""
-    echo "The hooks will automatically:"
-    echo "  • Format and check webui code before commits (pre-commit)"
-    echo "  • Build webui code before pushes (pre-push)"
-    echo "  • Stash unstaged changes during build process"
-    echo "  • Restore your unstaged changes after the push"
-    echo ""
-    echo "To test the hooks:"
-    echo "  • Make a change to a file in the webui directory and commit it (triggers format/check)"
-    echo "  • Push your commits to trigger the build process"
+    echo "The hook will automatically:"
+    echo "  • Format, lint and check webui code before commits"
+    echo "  • Build webui and stage tools/server/public/ into the same commit"
 else
-    echo "❌ Failed to make hooks executable"
+    echo "❌ Failed to make hook executable"
    exit 1
 fi
@@ -1,3 +1,3 @@
 rm -rf ../public/_app;
 rm ../public/favicon.svg;
-rm ../public/index.html;
+rm -f ../public/index.html.gz; # deprecated, but may still be generated by older versions of the build process
@@ -40,6 +40,17 @@
 	--code-background: oklch(0.985 0 0);
 	--code-foreground: oklch(0.145 0 0);
 	--layer-popover: 1000000;
+
+	--chat-form-area-height: 8rem;
+	--chat-form-area-offset: 2rem;
+	--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
+}
+
+@media (min-width: 640px) {
+	:root {
+		--chat-form-area-height: 24rem;
+		--chat-form-area-offset: 12rem;
+	}
 }

 .dark {
@@ -116,19 +127,6 @@
 	--color-sidebar-ring: var(--sidebar-ring);
 }

-:root {
-	--chat-form-area-height: 8rem;
-	--chat-form-area-offset: 2rem;
-	--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
-}
-
-@media (min-width: 640px) {
-	:root {
-		--chat-form-area-height: 24rem;
-		--chat-form-area-offset: 12rem;
-	}
-}
-
@layer base {
 	* {
 		@apply border-border outline-ring/50;
@@ -21,7 +21,7 @@ const config = {
 			strict: true
 		}),
 		output: {
-			bundleStrategy: 'inline'
+			bundleStrategy: 'single'
 		},
 		alias: {
 			$styles: 'src/styles'
@@ -2,5 +2,5 @@ import { expect, test } from '@playwright/test';

 test('home page has expected h1', async ({ page }) => {
 	await page.goto('/');
-	await expect(page.locator('h1')).toBeVisible();
+	await expect(page.locator('h1').first()).toBeVisible();
 });
@@ -1,7 +1,6 @@
 import tailwindcss from '@tailwindcss/vite';
 import { sveltekit } from '@sveltejs/kit/vite';
-import * as fflate from 'fflate';
-import { readFileSync, writeFileSync, existsSync } from 'fs';
+import { readFileSync, writeFileSync, existsSync, readdirSync, copyFileSync } from 'fs';
 import { dirname, resolve } from 'path';
 import { fileURLToPath } from 'url';

@@ -20,15 +19,13 @@ const GUIDE_FOR_FRONTEND = `
 -->
 `.trim();

-const MAX_BUNDLE_SIZE = 2 * 1024 * 1024;
-
 /**
 * the maximum size of an embedded asset in bytes,
 * e.g. maximum size of embedded font (see node_modules/katex/dist/fonts/*.woff2)
 */
 const MAX_ASSET_SIZE = 32000;

-/** public/index.html.gz minified flag */
+/** public/index.html minified flag */
 const ENABLE_JS_MINIFICATION = true;

 function llamaCppBuildPlugin() {
@@ -40,7 +37,6 @@ function llamaCppBuildPlugin() {
 			setTimeout(() => {
 				try {
 					const indexPath = resolve('../public/index.html');
-					const gzipPath = resolve('../public/index.html.gz');

 					if (!existsSync(indexPath)) {
 						return;
@@ -61,26 +57,35 @@ function llamaCppBuildPlugin() {

 					content = content.replace(/\r/g, '');
 					content = GUIDE_FOR_FRONTEND + '\n' + content;
+					content = content.replace(/\/_app\/immutable\/bundle\.[^"]+\.js/g, './bundle.js');
+					content = content.replace(
+						/\/_app\/immutable\/assets\/bundle\.[^"]+\.css/g,
+						'./bundle.css'
+					);

-					const compressed = fflate.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
+					writeFileSync(indexPath, content, 'utf-8');
+					console.log('✓ Updated index.html');

-					compressed[0x4] = 0;
-					compressed[0x5] = 0;
-					compressed[0x6] = 0;
-					compressed[0x7] = 0;
-					compressed[0x9] = 0;
-
-					if (compressed.byteLength > MAX_BUNDLE_SIZE) {
-						throw new Error(
-							`Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
-								`Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.ts.\n`
-						);
+					// Copy bundle.*.js -> ../public/bundle.js
+					const immutableDir = resolve('../public/_app/immutable');
+					const bundleDir = resolve('../public/_app/immutable/assets');
+					if (existsSync(immutableDir)) {
+						const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/));
+						if (jsFiles.length > 0) {
+							copyFileSync(resolve(immutableDir, jsFiles[0]), resolve('../public/bundle.js'));
+							console.log(`✓ Copied ${jsFiles[0]} -> bundle.js`);
+						}
+					}
+					// Copy bundle.*.css -> ../public/bundle.css
+					if (existsSync(bundleDir)) {
+						const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/));
+						if (cssFiles.length > 0) {
+							copyFileSync(resolve(bundleDir, cssFiles[0]), resolve('../public/bundle.css'));
+							console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`);
+						}
 					}
-
-					writeFileSync(gzipPath, compressed);
-					console.log('✓ Created index.html.gz');
 				} catch (error) {
-					console.error('Failed to create gzip file:', error);
+					console.error('Failed to update index.html:', error);
 				}
 			}, 100);
 		}
Author	SHA1	Message	Date
Adrien Gallouët	6307ec07d3	common : cleanup logs and modernize the progress bar (#21215 ) ``` $ build/bin/llama-server -hf unsloth/Qwen3.5-0.8B-GGUF common_download_file_single_online: HEAD failed, status: 404 no remote preset found, skipping Downloading mmproj-BF16.gguf ——————————————————————————————————————— 100% Downloading Qwen3.5-0.8B-Q4_K_M.gguf ——————————————————————————————— 100% ... ``` Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-03-31 16:18:00 +02:00
hipudding	632219af73	CANN: fix multi-thread set_tensor race conditions (#20151 ) * CANN: fix multi-thread set_tensor race conditions When ollama calls ggml_backend_tensor_set from multiple threads (each writing a different chunk of the same tensor), the CANN backend had three concurrency issues: 1. Quantized tensors (Q4_0/Q8_0) require a full-tensor format transform before uploading to device. Per-chunk transforms produced corrupt data. 2. ND-to-NZ weight conversion requires complete tensor data on device. Per-chunk conversion operated on incomplete data. 3. The global g_nz_workspaces array had unprotected concurrent access. Fix by introducing a TensorSetTracker that accumulates write progress per tensor. For quantized tensors, raw data is staged in a host buffer and the transform + upload is deferred until all chunks arrive. For NZ weights, chunks are uploaded directly but conversion is deferred. The tracker and its staging buffer are released immediately after post-processing completes. Add per-device mutex to g_nz_workspaces to prevent data races. * CANN: fix L2_NORM ignoring eps parameter The L2_NORM implementation was not using the eps parameter from op_params, causing incorrect results when eps is large (e.g. 10.0). The CPU reference computes scale = 1/fmaxf(norm, eps), so add a Clamp step to clamp the norm to at least eps before dividing. * ggml/cann: compare op_params for POOL_2D in ACL graph cache matching When ACL graph mode is enabled, the graph LRU cache checks whether a cached graph matches the current computation graph. Previously, GGML_OP_POOL_2D was not included in the op_params comparison, so two POOL_2D nodes with different pooling parameters (kernel size, stride, padding) but identical tensor shapes and addresses could incorrectly reuse a cached graph, leading to wrong results or aclnn errors. Add GGML_OP_POOL_2D to the list of ops that require op_params matching in ggml_graph_node_properties::has_matching_properties(). 
* cann: fix ACL graph cache matching by adding tensor type and unconditional op_params comparison The ACL graph LRU cache was incorrectly reusing cached graphs for operations with different tensor types or op_params, causing test failures for CPY (f16 vs bf16), POOL_2D, L2_NORM, NORM_MUL_ADD, RMS_NORM_MUL_ADD, and ADD_RMS_NORM. Changes: - Add node_type and src_type[] fields to ggml_graph_node_properties so the cache can distinguish tensors with different types but identical ne/nb (e.g. f16 and bf16 both have 2-byte elements) - Compare op_params unconditionally for all ops instead of only for SCALE/UNARY/GLU/ROPE/POOL_2D	2026-03-31 17:00:51 +03:00
Xuan-Son Nguyen	4a00bbfed6	server: (webui) no more gzip compression (#21073 ) * webui: no more gzip * try changing a small line * Revert "try changing a small line" This reverts commit `0d7a353159`. * fix lint * fix test * rebuild * split into html/css/js * lint * chore: update webui build output * chore: Update git hooks script * server: update webui build output * chore: Update pre-commit hook * refactor: Cleanup --------- Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>	2026-03-31 15:44:26 +02:00
Aldehir Rojas	624733d631	common : gpt-oss handle builtin and unsolicited tool calls (#21213 )	2026-03-31 13:52:42 +02:00
lainon1	0b6ff47996	fix: correct misspellings in code comments (#21217 ) - emdeddings → embeddings (gemma3.cpp, gemma3n-iswa.cpp, gemma-embedding.cpp) - imlpemented → implemented (llama-adapter.cpp) - interere → interfere (llama-graph.cpp) - overridde → overridden (chat.cpp) - stastistics → statistics (ngram-map.h) - layed → laid (llama-kv-cache.h) - worster → worst (llama-context.cpp) - sequantial → sequential (llama-batch.h)	2026-03-31 13:50:51 +02:00