grammar : use int64_t to avoid int overflows in int schema to grammar conversion logic (#16626 )

SYCL SET operator optimized for F32 tensors (#16350 )
* SYCL/SET: implement operator + wire-up; docs/ops updates; element_wise & ggml-sycl changes * sycl(SET): re-apply post-rebase; revert manual docs/ops.md; style cleanups * move SET op to standalone file, GPU-only implementation * Update SYCL SET operator for F32 * ci: fix editorconfig issues (LF endings, trailing spaces, final newline) * fixed ggml-sycl.cpp --------- Co-authored-by: Gitty Burstein <gitty@example.com>
2026-07-01 10:07:44 +02:00 · 2025-10-17 08:59:31 +03:00 · 2025-10-17 10:36:40 +08:00 · 2025-10-16 19:00:31 +02:00 · 2025-10-16 16:28:41 +02:00 · 2025-10-16 15:26:21 +02:00
53 changed files with 3609 additions and 2424 deletions
@@ -1760,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
-        string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
        [](common_params & params, int value) {
            params.cpuparams.n_threads = value;
            if (params.cpuparams.n_threads <= 0) {
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }

-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();

    auto digit_range = [&](char from, char to) {
        out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            }
            digit_range(c, c);
            out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@@ -925,17 +925,17 @@ public:
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
            } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
            }
            if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
            } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
            }
            std::stringstream out;
            out << "(";
@@ -22,6 +22,7 @@ Legend:
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
@@ -41,6 +42,7 @@ Legend:
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
@@ -82,6 +84,7 @@ Legend:
 |                             ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -108,5 +111,6 @@ Legend:
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                         TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
 |                            XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -59,6 +59,14 @@
 "CPU","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
@@ -119,6 +127,14 @@
 "CPU","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","CPU"
@@ -577,6 +577,10 @@ extern "C" {
        GGML_UNARY_OP_EXP,
        GGML_UNARY_OP_GELU_ERF,
        GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,

        GGML_UNARY_OP_COUNT,
    };
@@ -1151,6 +1155,46 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+     /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+
+
    // xIELU activation function
    // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
    // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
@@ -51,28 +51,31 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
    return ACL_DT_UNDEFINED;
 }

-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
-                                   size_t* nb, int64_t dims, aclFormat format,
-                                   size_t offset) {
+aclTensor * ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                    int64_t *           ne,
+                                    size_t *            nb,
+                                    int64_t             dims,
+                                    aclFormat           format,
+                                    size_t              offset) {
    // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
    // added.
    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];

    if (ne == nullptr) {
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            acl_ne[i] = tensor->ne[i];
+            acl_ne[i]     = tensor->ne[i];
            // The step size of acl is in elements.
            acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
        }
    } else {
        // With bcast
        for (int i = 0; i < dims; i++) {
-            acl_ne[i] = ne[i];
+            acl_ne[i]     = ne[i];
            acl_stride[i] = nb[i] / ggml_element_size(tensor);
        }
    }

-    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t final_dims      = (dims == 0 ? GGML_MAX_DIMS : dims);
    int64_t acl_storage_len = 1;
    for (int i = 0; i < final_dims; i++) {
        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
@@ -84,15 +87,13 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    std::reverse(acl_ne, acl_ne + final_dims);
    std::reverse(acl_stride, acl_stride + final_dims);

-    aclTensor* acl_tensor = aclCreateTensor(
-        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
-        elem_offset, format, &acl_storage_len, 1,
-        tensor->data);
+    aclTensor * acl_tensor = aclCreateTensor(acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
+                                             elem_offset, format, &acl_storage_len, 1, tensor->data);

    return acl_tensor;
 }

-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1) {
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
            return true;
@@ -101,15 +102,16 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
    return false;
 }

-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
-                                  const ggml_tensor* src1,
-                                  int64_t* bcast_src0_ne,
-                                  int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
-                                  size_t* bcast_src1_nb) {
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_src0_ne,
+                                  int64_t *           bcast_src1_ne,
+                                  size_t *            bcast_src0_nb,
+                                  size_t *            bcast_src1_nb) {
    GGML_ASSERT(ggml_can_repeat(src1, src0));
    int bcast_dim_cnt = 0;
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        int64_t nr = src0->ne[i] / src1->ne[i];
+        int64_t nr                   = src0->ne[i] / src1->ne[i];
        bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
        bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
        bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
@@ -119,21 +121,26 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
            // Need to add an extra dim.
            bcast_src0_ne[bcast_dim_cnt] = nr;
            bcast_src1_ne[bcast_dim_cnt] = 1;
-            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
-                                           bcast_src0_ne[bcast_dim_cnt - 1];
-            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
-                                           bcast_src1_ne[bcast_dim_cnt - 1];
+            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] * bcast_src0_ne[bcast_dim_cnt - 1];
+            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] * bcast_src1_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
    return bcast_dim_cnt;
 }

-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb) {
    // input and dst shoule in same shape, except first two dims.
    GGML_ASSERT(input_ne[2] == dst_ne[2]);
    GGML_ASSERT(input_ne[3] == dst_ne[3]);
@@ -148,34 +155,30 @@ int64_t ggml_cann_get_mulmat_bcast_shape(
        // Do not use bcast in the first two dimensions because we only support
        // the bcast batch dimension. Just copy them.
        if (i < 2 || nr == 1) {
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i];
            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i];

-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
            bcast_dim_cnt++;
        } else {
            // Need to add an extra dim.
-            bcast_input_ne[bcast_dim_cnt] = nr;
-            bcast_dst_ne[bcast_dim_cnt] = nr;
+            bcast_input_ne[bcast_dim_cnt]  = nr;
+            bcast_dst_ne[bcast_dim_cnt]    = nr;
            bcast_weight_ne[bcast_dim_cnt] = 1;
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
            bcast_dim_cnt++;

-            bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i] / nr;
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i] / nr;
            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
-                                            bcast_input_ne[bcast_dim_cnt - 1];
-            bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
-                                          bcast_dst_ne[bcast_dim_cnt - 1];
-            bcast_weight_nb[bcast_dim_cnt] =
-                bcast_weight_nb[bcast_dim_cnt - 1] *
-                bcast_weight_ne[bcast_dim_cnt - 1];
+            bcast_input_nb[bcast_dim_cnt]  = bcast_input_nb[bcast_dim_cnt - 1] * bcast_input_ne[bcast_dim_cnt - 1];
+            bcast_dst_nb[bcast_dim_cnt]    = bcast_dst_nb[bcast_dim_cnt - 1] * bcast_dst_ne[bcast_dim_cnt - 1];
+            bcast_weight_nb[bcast_dim_cnt] = bcast_weight_nb[bcast_dim_cnt - 1] * bcast_weight_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
@@ -62,10 +62,12 @@ aclDataType ggml_cann_type_mapping(ggml_type type);
 * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return  Pointer to the created ACL tensor.
 */
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
-                             size_t* nb = nullptr, int64_t dims = 0,
-                             aclFormat format = ACL_FORMAT_ND,
-                             size_t offset = 0);
+aclTensor * ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                    int64_t *           ne     = nullptr,
+                                    size_t *            nb     = nullptr,
+                                    int64_t             dims   = 0,
+                                    aclFormat           format = ACL_FORMAT_ND,
+                                    size_t              offset = 0);

 /**
 * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
@@ -87,12 +89,15 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
 * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return  Pointer to the created ACL tensor.
 */
-template<typename TYPE>
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   TYPE type_size, int64_t* ne, TYPE* nb,
-                                   int64_t dims,
-                                   aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0) {
+template <typename TYPE>
+aclTensor * ggml_cann_create_tensor(void *      data_ptr,
+                                    aclDataType dtype,
+                                    TYPE        type_size,
+                                    int64_t *   ne,
+                                    TYPE *      nb,
+                                    int64_t     dims,
+                                    aclFormat   format = ACL_FORMAT_ND,
+                                    size_t      offset = 0) {
    int64_t tmp_ne[GGML_MAX_DIMS * 2];
    int64_t tmp_stride[GGML_MAX_DIMS * 2];

@@ -109,9 +114,8 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
    std::reverse(tmp_ne, tmp_ne + dims);
    std::reverse(tmp_stride, tmp_stride + dims);

-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
+    aclTensor * acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, format, &acl_storage_len, 1, data_ptr);

    return acl_tensor;
 }
@@ -132,7 +136,7 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
 *          to 1. If such a dimension is found, broadcasting is required to align t1
 *          with t0 for element-wise operations.
 */
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1);

 /**
 * @brief   Computes broadcast shapes and strides for two ggml_tensors.
@@ -187,19 +191,21 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 *  dim1 in a inserted dim, should add nb for dim1,
 *  and all other nb moves to next in order.
 */
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
-                        int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_ne_src0,
+                                  int64_t *           bcast_ne_src1,
+                                  size_t *            bcast_nb_src0,
+                                  size_t *            bcast_nb_src1);

 // Bcast macro to avoid duplicate code.
-#define BCAST_SHAPE(src0, src1)                                              \
-    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
-    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
-    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
-    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
-    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
-        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
-        bcast_##src1##_nb);
+#define BCAST_SHAPE(src0, src1)                                                                      \
+    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src0##_nb[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src1##_nb[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_dims = ggml_cann_get_bcast_shape(src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, \
+                                                   bcast_##src0##_nb, bcast_##src1##_nb);

 #define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims

@@ -233,26 +239,31 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* sr
 *       before cast dim.
 * @sa ggml_cann_get_bcast_shape
 */
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb);

 // Bcast macro to avoid duplicate code.
-#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
-    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
-    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
-    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
-    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
-    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
-    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
-    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
-        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
-        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
-        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                                                                  \
+    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                                                               \
+    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                                                              \
+    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                                                                 \
+    size_t  bcast_##input##_nb[GGML_MAX_DIMS * 2];                                                               \
+    size_t  bcast_##weight##_nb[GGML_MAX_DIMS * 2];                                                              \
+    size_t  bcast_##dst##_nb[GGML_MAX_DIMS * 2];                                                                 \
+    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(                                                       \
+        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, bcast_##input##_ne, bcast_##weight##_ne, \
+        bcast_##dst##_ne, bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);

-#define BCAST_MUL_MAT_PARAM(tensor) \
-    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+#define BCAST_MUL_MAT_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims

 #endif  // CANN_ACL_TENSOR_H
@@ -62,7 +62,7 @@
 * @param   dst The ggml tensor representing the destination, which op is
 *              GGML_OP_REPEAT and specifies the desired dimensions.
 */
-void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
@@ -82,7 +82,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result of the Leaky ReLU
 *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
 */
-void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief    Concatenates multiple tensors along a specified dimension using the
@@ -97,7 +97,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @attention tensorList length should be 2 and the dimension using for concat
 *            default to 1.
 */
-void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Generates a sequence of evenly spaced values within a specified
@@ -113,7 +113,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *            `start`, 'stop' and 'step' are in dst->op_params and dst->op is
 *            `GGML_OP_ARANGE`.
 */
-void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies a clamp operation to the elements of a ggml tensor using the
@@ -131,7 +131,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the clamped values will be stored.
 *            dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
 */
-void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Scales the elements of a ggml tensor by a constant factor using the
@@ -148,7 +148,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the scaled values will be stored.
 *            dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
 */
-void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Sorts the elements of a ggml tensor and returns the indices that
@@ -163,7 +163,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the sorted indices will be stored.
 *            dst->op is `GGML_OP_ARGSORT`.
 */
-void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
@@ -185,7 +185,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the normalized values will be stored.
 * @attention `Var` defaults to dst->ne[0].
 */
-void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief  Computes the Group Normalization for a ggml tensor using the CANN
@@ -209,7 +209,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @attention eps defaults to 1e-6f.
 */
-void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the accumulation of tensors using the CANN backend.
@@ -228,7 +228,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the accumulated values will be stored.
 *            `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
 */
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the sum of elements along the last dimension of a ggml tensor
@@ -244,7 +244,7 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @attention `reduce_dims` defaults to 3, which means the last dimension.
 */
-void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the sum of elements in a ggml tensor.
@@ -258,7 +258,7 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 */

-void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
@@ -274,8 +274,7 @@ void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the upsampled values will be stored.
 *            dst->op is `GGML_OP_UPSCALE`.
 */
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
-                                  ggml_tensor* dst);
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
@@ -290,7 +289,7 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
 * @param dst The destination tensor, which specifies the target dimensions for
 *            padding. dst->op is `GGML_OP_PAD`.
 */
-void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
@@ -307,7 +306,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor on which the pooling operation is to be
 *            performed. dst->op is `GGML_OP_POOL_2D`.
 */
-void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Duplicates a ggml tensor using the CANN backend.
@@ -326,7 +325,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *            different shape and dst is no-contiguous.
 * @note:     This func need to simplify.
 */
-void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
@@ -348,7 +347,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the normalized values will be stored.
 *            dst->op is `GGML_OP_RMS_NORM`.
 */
-void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies a diagonal mask to the tensor with a specified value.
@@ -363,7 +362,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *            `GGML_OP_DIAG_MASK`
 * @param value The value to use for masking.
 */
-void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
+void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);

 /**
 * @brief   Performs an image-to-column transformation on the input tensor.
@@ -378,7 +377,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float
 * @param dst The destination tensor that stores the result of the operation.
 *            dst->op is `GGML_OP_IM2COL`.
 */
-void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes time step embeddings using sine and cosine functions.
@@ -392,10 +391,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result of the embedding operation
 *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
 */
-void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 // @see ggml_cann_dup.
-void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the softmax activation with optional masking.
@@ -417,7 +416,7 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored. dst->op is
 *            `GGML_OP_SOFTMAX`.
 */
-void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Extracts specific rows from a tensor based on indices.
@@ -429,7 +428,7 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the extracted rows will be stored.
 */
-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Writes specific rows into a tensor at positions specified by indices.
@@ -441,7 +440,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the specified rows will be updated.
 */
-void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Executes matrix multiplication for the given tensor.
@@ -454,7 +453,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor for storing the result of the matrix
 *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
 */
-void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
@@ -477,7 +476,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @note The function currently does not support cases where the freq_scale is
 *       not equal 1.
 */
-void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the index of the maximum value along the specified dimension
@@ -492,7 +491,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the indices of the maximum values will
 *            be stored. dst->op is `GGML_OP_ARGMAX`.
 */
-void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief Adds two tensors element-wise and stores the result in a destination
@@ -509,8 +508,10 @@ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param acl_src1 The second source tensor.
 * @param acl_dst The destination tensor where the result will be stored.
 */
-void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+void aclnn_add(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src0,
+               aclTensor *                 acl_src1,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Sub two tensors element-wise and stores the result in a destination
@@ -527,8 +528,10 @@ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
 * @param acl_src1 The second source tensor.
 * @param acl_dst The destination tensor where the result will be stored.
 */
-void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+void aclnn_sub(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src0,
+               aclTensor *                 acl_src1,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Performs element-wise multiplication of two tensors and stores the
@@ -546,8 +549,10 @@ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
 * @param acl_other The second tensor for element-wise multiplication.
 * @param acl_dst The destination tensor where the result will be stored.
 */
-void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+void aclnn_mul(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src,
+               aclTensor *                 acl_other,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Matrix division, optionally in-place.
@@ -567,8 +572,10 @@ void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param inplace Flag indicating whether to perform the operation in-place on
 * `acl_src`.
 */
-void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+void aclnn_div(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src,
+               aclTensor *                 acl_other,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Applies element-wise cosine function to the elements of a tensor.
@@ -584,8 +591,7 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param acl_dst The destination tensor where the cosine results will be
 * stored.
 */
-void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
+void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);

 /**
 * @brief Applies element-wise sine function to the elements of a tensor.
@@ -602,8 +608,7 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param acl_src The source tensor on which the sine function will be applied.
 * @param acl_dst The destination tensor where the sine results will be stored.
 */
-void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
+void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);

 /**
 * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
@@ -621,8 +626,12 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
 * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
 */
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
-    aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+void bcast_shape(ggml_tensor * src0,
+                 ggml_tensor * src1,
+                 ggml_tensor * dst,
+                 aclTensor **  acl_src0,
+                 aclTensor **  acl_src1,
+                 aclTensor **  acl_dst);

 /**
 * @brief   Computes the 1D transposed convolution (deconvolution) of a ggml
@@ -637,7 +646,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
 * @param dst The destination tensor where the transposed convolution result
 * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
 */
-void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
@@ -662,7 +671,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
 * @param dst The destination tensor where the ELU-activated result will be stored.
 *            dst->op is expected to be `GGML_OP_ELU`.
 */
-void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the mean of a ggml tensor element-wise using the CANN backend.
@@ -677,7 +686,7 @@ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the mean result will be stored.
 *            dst->op is expected to be `GGML_OP_MEAN`.
 */
-void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies 1D reflect padding to a ggml tensor using the CANN backend.
@@ -692,7 +701,7 @@ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the padded result will be stored.
 *            dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
 */
-void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Counts the number of equal elements in two ggml tensors using the CANN backend.
@@ -708,7 +717,7 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
 */
-void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies the Step activation function to a ggml tensor using the CANN backend.
@@ -723,7 +732,7 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_STEP`.
 */
-void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Performs the Flash Attention extended operator using the CANN backend.
@@ -738,59 +747,46 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
 */
-void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /*
 * @brief A generic wrapper for ACL resources with custom deleter support.
 */
-using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+using any_acl_resource = std::unique_ptr<void, std::function<void(void *)>>;

 /**
 * @brief Trait structure used to define how to destroy a given ACL resource type.
 *
 * @tparam T ACL resource type.
 */
-template<typename T>
-struct acl_resource_traits;
+template <typename T> struct acl_resource_traits;

 /**
 * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
 */
-template<>
-struct acl_resource_traits<aclTensor> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
-    }
+template <> struct acl_resource_traits<aclTensor> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyTensor(static_cast<aclTensor *>(p))); }
 };

 /**
 * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
 */
-template<>
-struct acl_resource_traits<aclIntArray> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
-    }
+template <> struct acl_resource_traits<aclIntArray> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray *>(p))); }
 };

 /**
 * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
 */
-template<>
-struct acl_resource_traits<aclScalar> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
-    }
+template <> struct acl_resource_traits<aclScalar> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyScalar(static_cast<aclScalar *>(p))); }
 };

 /**
 * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
 */
-template<>
-struct acl_resource_traits<aclTensorList> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
-    }
+template <> struct acl_resource_traits<aclTensorList> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList *>(p))); }
 };

 /**
@@ -800,14 +796,8 @@ struct acl_resource_traits<aclTensorList> {
 * @param ptr Raw pointer to ACL resource.
 * @return any_acl_resource Smart pointer that handles destruction.
 */
-template<typename T>
-any_acl_resource make_acl_resource(T* ptr) {
-    return any_acl_resource(
-        static_cast<void*>(ptr),
-        [](void* p) {
-            acl_resource_traits<T>::destroy(p);
-        }
-    );
+template <typename T> any_acl_resource make_acl_resource(T * ptr) {
+    return any_acl_resource(static_cast<void *>(ptr), [](void * p) { acl_resource_traits<T>::destroy(p); });
 }

 /**
@@ -817,8 +807,7 @@ any_acl_resource make_acl_resource(T* ptr) {
 * @param vec Target vector to hold ACL resources.
 * @param args Raw pointers to ACL resources.
 */
-template<typename... Args>
-void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+template <typename... Args> void register_acl_resources(std::vector<any_acl_resource> & vec, Args *... args) {
    (vec.emplace_back(make_acl_resource(args)), ...);
 }

@@ -826,39 +815,36 @@ void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
 * @brief Task class that wraps the execution of an aclnn function call.
 */
 class aclnn_task : public cann_task {
-    public:
-        aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
-                   uint64_t workspace_size, aclOpExecutor * executor,
-                   aclrtStream stream) :
-            aclnn_func_(aclnn_func),
-            workspace_addr_(workspace_addr),
-            workspace_size_(workspace_size),
-            executor_(executor),
-            stream_(stream) {}
-        virtual void run_task() override {
-            ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
-        }
-    private:
-        aclnn_func_t aclnn_func_;
-        void *          workspace_addr_;
-        uint64_t        workspace_size_;
-        aclOpExecutor * executor_;
-        aclrtStream     stream_;
+  public:
+    aclnn_task(aclnn_func_t    aclnn_func,
+               void *          workspace_addr,
+               uint64_t        workspace_size,
+               aclOpExecutor * executor,
+               aclrtStream     stream) :
+        aclnn_func_(aclnn_func),
+        workspace_addr_(workspace_addr),
+        workspace_size_(workspace_size),
+        executor_(executor),
+        stream_(stream) {}
+
+    virtual void run_task() override { ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_)); }
+  private:
+    aclnn_func_t    aclnn_func_;
+    void *          workspace_addr_;
+    uint64_t        workspace_size_;
+    aclOpExecutor * executor_;
+    aclrtStream     stream_;
 };

 /**
 * @brief Task class that releases ACL resources after usage.
 */
 class release_resource_task : public cann_task {
-public:
-    release_resource_task(std::vector<any_acl_resource>&& resources){
-        resource_ = std::move(resources);
-    }
+  public:
+    release_resource_task(std::vector<any_acl_resource> && resources) { resource_ = std::move(resources); }

-    virtual void run_task() override {
-        resource_.clear();
-    }
-private:
+    virtual void run_task() override { resource_.clear(); }
+  private:
    std::vector<any_acl_resource> resource_;
 };

@@ -866,38 +852,40 @@ private:
 * @brief Task class for performing asynchronous memory copy operations.
 */
 class async_memcpy_task : public cann_task {
-public:
-    async_memcpy_task(void* dst, const void* src, size_t size,
-                      aclrtMemcpyKind kind, aclrtStream stream)
-        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
+  public:
+    async_memcpy_task(void * dst, const void * src, size_t size, aclrtMemcpyKind kind, aclrtStream stream) :
+        dst_(dst),
+        src_(src),
+        size_(size),
+        kind_(kind),
+        stream_(stream) {}

-    virtual void run_task() override {
-        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
-    }
-private:
-    void* dst_;
-    const void* src_;
-    size_t size_;
+    virtual void run_task() override { ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_)); }
+  private:
+    void *          dst_;
+    const void *    src_;
+    size_t          size_;
    aclrtMemcpyKind kind_;
-    aclrtStream stream_;
+    aclrtStream     stream_;
 };

 /**
 * @brief Task class for performing asynchronous memory set operations.
 */
 class async_memset_task : public cann_task {
-    public:
-    async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
-            : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
+  public:
+    async_memset_task(void * buffer, size_t size, int32_t value, aclrtStream stream) :
+        buffer_(buffer),
+        size_(size),
+        value_(value),
+        stream_(stream) {}

-        virtual void run_task() override {
-            ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
-        }
-    private:
-        void* buffer_;
-        size_t size_;
-        int32_t value_;
-        aclrtStream stream_;
+    virtual void run_task() override { ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_)); }
+  private:
+    void *      buffer_;
+    size_t      size_;
+    int32_t     value_;
+    aclrtStream stream_;
 };

 /**
@@ -918,25 +906,24 @@ class async_memset_task : public cann_task {
 * same stream are executed in queue order.
 */

-#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                          \
-    do {                                                                                    \
-        uint64_t        workspaceSize = 0;                                                  \
-        aclOpExecutor * executor;                                                           \
-        void *          workspaceAddr = nullptr;                                            \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
-        /* workspace should alloced in main thread to keep malloc order when using vmm. */  \
-        if (workspaceSize > 0) {                                                            \
-            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);            \
-            workspaceAddr = workspace_allocator.get();                                      \
-        }                                                                                   \
-        if (CTX.async_mode) {                                                               \
-            auto task =                                                                     \
-                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize,  \
-                    executor, CTX.stream()); \
-            CTX.task_queue.submit_task(std::move(task));                                    \
-        } else {                                                                            \
-            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
-        }                                                                                   \
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                                                  \
+    do {                                                                                                            \
+        uint64_t        workspaceSize = 0;                                                                          \
+        aclOpExecutor * executor;                                                                                   \
+        void *          workspaceAddr = nullptr;                                                                    \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));                        \
+        /* workspace should alloced in main thread to keep malloc order when using vmm. */                          \
+        if (workspaceSize > 0) {                                                                                    \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);                                    \
+            workspaceAddr = workspace_allocator.get();                                                              \
+        }                                                                                                           \
+        if (CTX.async_mode) {                                                                                       \
+            auto task =                                                                                             \
+                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize, executor, CTX.stream()); \
+            CTX.task_queue.submit_task(std::move(task));                                                            \
+        } else {                                                                                                    \
+            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));                        \
+        }                                                                                                           \
    } while (0)

 /**
@@ -947,11 +934,10 @@ class async_memset_task : public cann_task {
 * @param ctx Backend context which manages task submission and async mode.
 * @param args Pointers to ACL resources to be released.
 */
-template <typename... Args>
-void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
+template <typename... Args> void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
    std::vector<any_acl_resource> resources;
    register_acl_resources(resources, std::forward<Args>(args)...);
-    if(ctx.async_mode) {
+    if (ctx.async_mode) {
        auto task = std::make_unique<release_resource_task>(std::move(resources));
        ctx.task_queue.submit_task(std::move(task));
    }
@@ -966,8 +952,11 @@ void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... arg
 * @param len Size of memory to copy (in bytes).
 * @param kind Type of memory copy (host-to-device, device-to-host, etc).
 */
-inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
-                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx,
+                                   void *                      dst,
+                                   const void *                src,
+                                   size_t                      len,
+                                   aclrtMemcpyKind             kind) {
    if (ctx.async_mode) {
        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
        ctx.task_queue.submit_task(std::move(task));
@@ -976,8 +965,11 @@ inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
    }
 }

-inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
-                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx,
+                                   void *                      dst,
+                                   const void *                src,
+                                   size_t                      len,
+                                   aclrtMemcpyKind             kind) {
    if (ctx->async_mode) {
        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
        ctx->task_queue.submit_task(std::move(task));
@@ -994,8 +986,7 @@ inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
 * @param size Size of the memory buffer (in bytes).
 * @param value Value to set in the buffer.
 */
-inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
-                                   size_t size, int value) {
+inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer, size_t size, int value) {
    if (ctx.async_mode) {
        auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
        ctx.task_queue.submit_task(std::move(task));
@@ -1029,7 +1020,7 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
 * @param dst The destination tensor where the expert-weighted token outputs are stored.
 *            Expected to be of shape [M, K, N, 1].
 */
-void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Check whether a tensor is a weight tensor for matrix multiplication.
@@ -1041,20 +1032,14 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @param tensor Pointer to the target ggml_tensor object (const-qualified).
 */
-static bool is_matmul_weight(const ggml_tensor* tensor) {
-    std::string name = ggml_get_name(tensor);
-    static const std::unordered_set<std::string> weight_suffixes{
-        "output.weight",
-        "attn_q.weight",
-        "attn_k.weight",
-        "attn_v.weight",
-        "attn_output.weight",
-        "ffn_gate.weight",
-        "ffn_up.weight",
-        "ffn_down.weight"
-    };
+static bool is_matmul_weight(const ggml_tensor * tensor) {
+    std::string                                  name = ggml_get_name(tensor);
+    static const std::unordered_set<std::string> weight_suffixes{ "output.weight",      "attn_q.weight",
+                                                                  "attn_k.weight",      "attn_v.weight",
+                                                                  "attn_output.weight", "ffn_gate.weight",
+                                                                  "ffn_up.weight",      "ffn_down.weight" };

-    for (const auto& suffix : weight_suffixes) {
+    for (const auto & suffix : weight_suffixes) {
        if (name.find(suffix) != std::string::npos) {
            return true;
        }
@@ -1078,14 +1063,13 @@ static bool is_matmul_weight(const ggml_tensor* tensor) {
 * @param ctx The CANN backend context used to manage execution and resources.
 * @param dst The destination tensor.
 */
-template <auto binary_op>
-void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];

-    aclTensor* acl_src0;
-    aclTensor* acl_src1;
-    aclTensor* acl_dst;
+    aclTensor * acl_src0;
+    aclTensor * acl_src1;
+    aclTensor * acl_dst;

    // Need bcast
    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
@@ -1094,7 +1078,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
 }

-
 /**
 * @brief Applies a unary operation to an input tensor using the CANN backend.
 *
@@ -1107,12 +1090,12 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 * @param ctx The CANN backend context for managing resources and execution.
 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 */
-template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
-    void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
+void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];

-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);

    unary_op(ctx, acl_src, acl_dst);
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
@@ -1138,9 +1121,9 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
 *
 * @see GGML_CANN_CALL_OP_UNARY
 */
-void ggml_cann_op_unary(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                        ggml_backend_cann_context &                                                ctx,
+                        ggml_tensor *                                                              dst);

 /**
 * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
@@ -1172,9 +1155,9 @@ void ggml_cann_op_unary(
 *
 * @see GGML_CANN_CALL_OP_UNARY_GATED
 */
-void ggml_cann_op_unary_gated(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                              ggml_backend_cann_context &                                                ctx,
+                              ggml_tensor *                                                              dst);

 /**
 * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
@@ -1197,16 +1180,13 @@ void ggml_cann_op_unary_gated(
 * @see ggml_cann_op_unary
 * @see GGML_CANN_CALL_ACLNN_OP
 */
-#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                              \
-    do {                                                              \
-        auto lambda = [](ggml_backend_cann_context& ctx,              \
-            aclTensor* acl_src,                                       \
-            aclTensor* acl_dst) {                                     \
-            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
-        };                                                            \
-        ggml_cann_op_unary(lambda, ctx, dst);                         \
-    }                                                                 \
-    while (0)
+#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                                                              \
+    do {                                                                                              \
+        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
+        };                                                                                            \
+        ggml_cann_op_unary(lambda, ctx, dst);                                                         \
+    } while (0)

 /**
 * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
@@ -1229,15 +1209,12 @@ void ggml_cann_op_unary_gated(
 * @see ggml_cann_op_unary_gated
 * @see GGML_CANN_CALL_ACLNN_OP
 */
-#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                        \
-    do {                                                              \
-        auto lambda = [](ggml_backend_cann_context& ctx,              \
-            aclTensor* acl_src,                                       \
-            aclTensor* acl_dst) {                                     \
-            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
-        };                                                            \
-        ggml_cann_op_unary_gated(lambda, ctx, dst);                   \
-    }                                                                 \
-    while (0)
+#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                                                        \
+    do {                                                                                              \
+        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
+        };                                                                                            \
+        ggml_cann_op_unary_gated(lambda, ctx, dst);                                                   \
+    } while (0)

 #endif  // CANN_ACLNN_OPS
@@ -44,7 +44,7 @@
 #include "../include/ggml.h"
 #include "../ggml-impl.h"

-#define MATRIX_ROW_PADDING 512
+#define MATRIX_ROW_PADDING    512
 #define GGML_CANN_MAX_STREAMS 8

 /**
@@ -56,8 +56,7 @@
 * @param line The line number at which the error occurred.
 * @param msg The error message.
 */
-[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
-                                  const char* file, int line, const char* msg);
+[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

 /**
 * @brief Checks the result of a CANN function call and invokes the error
@@ -89,25 +88,24 @@ struct ggml_cann_device_info {
     * @brief Information about a single CANN device.
     */
    struct cann_device_info {
-        int cc;                 /**< Compute capability.                   */
+        int    cc;              /**< Compute capability.                   */
        size_t smpb;            /**< Maximum shared memory per block.      */
-        bool vmm;               /**< Virtual memory support.               */
+        bool   vmm;             /**< Virtual memory support.               */
        size_t vmm_granularity; /**< Granularity of virtual memory.        */
        size_t total_vram;      /**< Total video RAM available on the device. */
    };

-    cann_device_info devices[GGML_CANN_MAX_DEVICES] =
-        {}; /**< Array of CANN device information. */
+    cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; /**< Array of CANN device information. */
 };

-const ggml_cann_device_info& ggml_cann_info();
+const ggml_cann_device_info & ggml_cann_info();

-void ggml_cann_set_device(int32_t device);
+void    ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();

-std::optional<std::string> get_env(const std::string& name);
-bool parse_bool(const std::string& value);
-int parse_integer(const std::string& value);
+std::optional<std::string> get_env(const std::string & name);
+bool                       parse_bool(const std::string & value);
+int                        parse_integer(const std::string & value);

 /**
 * @brief Abstract base class for memory pools used by CANN.
@@ -126,7 +124,7 @@ struct ggml_cann_pool {
     *                     will be stored.
     * @return             Pointer to the allocated memory block.
     */
-    virtual void* alloc(size_t size, size_t* actual_size) = 0;
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;

    /**
     * @brief Frees a previously allocated memory block.
@@ -136,16 +134,16 @@ struct ggml_cann_pool {
     * @note Note that all CANN opertors are running async. Make sure memory is
     *       still avaiable before this operator finished.
     */
-    virtual void free(void* ptr, size_t size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
 };

 /**
 * @brief RAII wrapper for managing memory allocations from a CANN memory pool.
 */
 struct ggml_cann_pool_alloc {
-    ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
-    void* ptr = nullptr;    /**< Pointer to the allocated memory block. */
-    size_t actual_size = 0; /**< Actual size of the allocated memory block. */
+    ggml_cann_pool * pool        = nullptr; /**< Pointer to the memory pool. */
+    void *           ptr         = nullptr; /**< Pointer to the allocated memory block. */
+    size_t           actual_size = 0;       /**< Actual size of the allocated memory block. */

    /**
     * @brief Default constructor.
@@ -156,16 +154,14 @@ struct ggml_cann_pool_alloc {
     * @brief Constructor that initializes the memory pool.
     * @param pool Reference to the memory pool.
     */
-    explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
+    explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) {}

    /**
     * @brief Constructor that initializes the memory pool and allocates memory.
     * @param pool Reference to the memory pool.
     * @param size Size of the memory block to allocate.
     */
-    ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
-        alloc(size);
-    }
+    ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { alloc(size); }

    /**
     * @brief Destructor that frees the allocated memory block.
@@ -181,7 +177,7 @@ struct ggml_cann_pool_alloc {
     * @param size Size of the memory block to allocate.
     * @return Pointer to the allocated memory block.
     */
-    void* alloc(size_t size) {
+    void * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = pool->alloc(size, &this->actual_size);
@@ -194,7 +190,7 @@ struct ggml_cann_pool_alloc {
     * @param size Size of the memory block to allocate.
     * @return Pointer to the allocated memory block.
     */
-    void* alloc(ggml_cann_pool& pool, size_t size) {
+    void * alloc(ggml_cann_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }
@@ -203,25 +199,25 @@ struct ggml_cann_pool_alloc {
     * @brief Gets the pointer to the allocated memory block.
     * @return Pointer to the allocated memory block.
     */
-    void* get() { return ptr; }
+    void * get() { return ptr; }

    // Deleted copy constructor
-    ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
+    ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete;

    // Deleted move constructor
-    ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
+    ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete;

    // Deleted copy assignment operator
-    ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
+    ggml_cann_pool_alloc & operator=(const ggml_cann_pool_alloc &) = delete;

    // Deleted move assignment operator
-    ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
+    ggml_cann_pool_alloc & operator=(ggml_cann_pool_alloc &&) = delete;
 };

 /**
 * @brief Function pointer type for ACLNN operator calls.
 */
-using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
+using aclnn_func_t = aclnnStatus (*)(void *, uint64_t, aclOpExecutor *, aclrtStream);

 /**
 * @brief Base class for all CANN tasks to be submitted to the task queue.
@@ -229,7 +225,7 @@ using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStrea
 * Users should override the run_task() method with actual task logic.
 */
 class cann_task {
-public:
+  public:
    virtual void run_task() {}
 };

@@ -237,16 +233,20 @@ public:
 * @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
 */
 class cann_task_queue {
-public:
+  public:
    /**
     * @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
     *
     * @param capacity Queue capacity. Must be a power of 2.
     * @param device Target device ID (used for context setting).
     */
-    explicit cann_task_queue(size_t capacity, int32_t device)
-        : buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
-          running_(false), device_(device) {
+    explicit cann_task_queue(size_t capacity, int32_t device) :
+        buffer_(capacity),
+        capacity_(capacity),
+        head_(0),
+        tail_(0),
+        running_(false),
+        device_(device) {
        GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
        mask_ = capacity_ - 1;
    }
@@ -257,7 +257,7 @@ public:
     * @param item Unique pointer to the task.
     * @return true if the task was successfully enqueued, false if the queue was full.
     */
-    bool enqueue(std::unique_ptr<cann_task>&& item) {
+    bool enqueue(std::unique_ptr<cann_task> && item) {
        size_t next_tail = (tail_ + 1) & mask_;

        if (next_tail == head_) {
@@ -276,17 +276,16 @@ public:
     *
     * @param task Task to be submitted.
     */
-    void submit_task(std::unique_ptr<cann_task>&& task) {
-        while(!enqueue(std::move(task))) {
+    void submit_task(std::unique_ptr<cann_task> && task) {
+        while (!enqueue(std::move(task))) {
            std::this_thread::yield();
            continue;
        }

        if (!running_) {
            running_ = true;
-            thread_ = std::thread(&cann_task_queue::execute, this);
+            thread_  = std::thread(&cann_task_queue::execute, this);
        }
-
    }

    /**
@@ -309,7 +308,7 @@ public:
        }
    }

-private:
+  private:
    /**
     * @brief Worker thread function that continuously dequeues and executes tasks.
     */
@@ -317,7 +316,7 @@ private:
        ggml_cann_set_device(device_);

        while (running_) {
-            if(head_ == tail_) {
+            if (head_ == tail_) {
                std::this_thread::yield();
                continue;
            }
@@ -330,24 +329,24 @@ private:
    }

    std::vector<std::unique_ptr<cann_task>> buffer_;
-    const size_t capacity_;
-    size_t mask_;
-    size_t head_;
-    size_t tail_;
-    bool running_;
-    std::thread thread_;
-    int32_t device_;
+    const size_t                            capacity_;
+    size_t                                  mask_;
+    size_t                                  head_;
+    size_t                                  tail_;
+    bool                                    running_;
+    std::thread                             thread_;
+    int32_t                                 device_;
 };

 #ifdef USE_ACL_GRAPH
 struct ggml_graph_node_properties {
    // dst tensor
-    void * node_address;
+    void *  node_address;
    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
+    size_t  nb[GGML_MAX_DIMS];

    // src tensor
-    void * src_address[GGML_MAX_SRC];
+    void *  src_address[GGML_MAX_SRC];
    int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
    size_t  src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];

@@ -376,13 +375,11 @@ struct ggml_cann_graph {
 * move existing graphs to the front (most recently used), and clear the cache.
 */
 struct ggml_cann_graph_lru_cache {
-    size_t capacity;  /**< Maximum number of graphs in the cache. */
+    size_t capacity;                         /**< Maximum number of graphs in the cache. */

-    std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
+    std::list<ggml_cann_graph *> cache_list; /**< List storing cached graphs as raw pointers. */

-    ggml_cann_graph_lru_cache() {
-        capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
-    }
+    ggml_cann_graph_lru_cache() { capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); }

    /**
     * @brief Push a new graph to the front of the cache.
@@ -390,11 +387,11 @@ struct ggml_cann_graph_lru_cache {
     * @param new_node Pointer to the new ggml_cann_graph to cache.
     *        Ownership is transferred to the cache (cache will delete it).
     */
-    void push(ggml_cann_graph* new_node) {
+    void push(ggml_cann_graph * new_node) {
        if (cache_list.size() >= capacity) {
-            ggml_cann_graph* old = cache_list.back();
+            ggml_cann_graph * old = cache_list.back();
            cache_list.pop_back();
-            delete old; // free the old graph
+            delete old;  // free the old graph
        }
        cache_list.push_front(new_node);
    }
@@ -403,7 +400,7 @@ struct ggml_cann_graph_lru_cache {
     * @brief Move an existing graph to the front of the cache.
     * @param node Pointer to the ggml_cann_graph to move.
     */
-    void move_to_front(ggml_cann_graph* node) {
+    void move_to_front(ggml_cann_graph * node) {
        cache_list.remove(node);
        cache_list.push_front(node);
    }
@@ -421,92 +418,89 @@ struct ggml_cann_graph_lru_cache {
    /**
     * @brief Destructor that clears the cache and frees all cached graphs.
     */
-    ~ggml_cann_graph_lru_cache() {
-        clear();
-    }
+    ~ggml_cann_graph_lru_cache() { clear(); }
 };
 #endif  // USE_ACL_GRAPH

 struct ggml_cann_rope_cache {
    ~ggml_cann_rope_cache() {
-        if(theta_scale_cache != nullptr) {
+        if (theta_scale_cache != nullptr) {
            ACL_CHECK(aclrtFree(theta_scale_cache));
        }
-        if(sin_cache != nullptr) {
+        if (sin_cache != nullptr) {
            ACL_CHECK(aclrtFree(sin_cache));
        }
-        if(cos_cache != nullptr) {
+        if (cos_cache != nullptr) {
            ACL_CHECK(aclrtFree(cos_cache));
        }
    }

-    void* theta_scale_cache = nullptr;
+    void *  theta_scale_cache  = nullptr;
    int64_t theta_scale_length = 0;
    // sin/cos cache, used only to accelerate first layer on each device
-    void* sin_cache = nullptr;
-    void* cos_cache = nullptr;
-    int64_t position_length = 0;
+    void *  sin_cache          = nullptr;
+    void *  cos_cache          = nullptr;
+    int64_t position_length    = 0;
    // Properties to check before reusing the sincos cache
-    bool cached = false;
-    float ext_factor = 0.0f;
-    float theta_scale = 0.0f;
-    float freq_scale = 0.0f;
-    float attn_factor = 0.0f;
-    bool is_neox = false;
+    bool    cached             = false;
+    float   ext_factor         = 0.0f;
+    float   theta_scale        = 0.0f;
+    float   freq_scale         = 0.0f;
+    float   attn_factor        = 0.0f;
+    bool    is_neox            = false;
 };

 struct ggml_cann_tensor_cache {
    ~ggml_cann_tensor_cache() {
-        if(cache != nullptr) {
+        if (cache != nullptr) {
            ACL_CHECK(aclrtFree(cache));
        }
    }

-    void* cache = nullptr;
-    int64_t size = 0;
+    void *  cache = nullptr;
+    int64_t size  = 0;
 };

 /**
 * @brief Context for managing CANN backend operations.
 */
 struct ggml_backend_cann_context {
-    int32_t device;                  /**< Device ID. */
-    std::string name;                /**< Name of the device. */
-    std::string description;         /**< Description of the device. */
-    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+    int32_t     device;               /**< Device ID. */
+    std::string name;                 /**< Name of the device. */
+    std::string description;          /**< Description of the device. */
+    aclrtEvent  copy_event = nullptr; /**< Event for managing copy operations. */
 #ifdef USE_ACL_GRAPH
    /// Cached CANN ACL graph used for executing the current ggml computation graph.
    ggml_cann_graph_lru_cache graph_lru_cache;
-    bool acl_graph_mode = true;
+    bool                      acl_graph_mode = true;
 #endif
-    cann_task_queue task_queue;
-    bool async_mode;
+    cann_task_queue        task_queue;
+    bool                   async_mode;
    // Rope Cache
-    ggml_cann_rope_cache rope_cache;
+    ggml_cann_rope_cache   rope_cache;
    // Constant Pool
    ggml_cann_tensor_cache rms_norm_one_tensor_cache;
    ggml_cann_tensor_cache rms_norm_zero_tensor_cache;

-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = { nullptr }; /**< Array of streams for the device. */

    /**
     * @brief Constructor for initializing the context with a given device.
     * @param device Device ID.
     */
-    explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
+    explicit ggml_backend_cann_context(int device) :
+        device(device),
+        name("CANN" + std::to_string(device)),
+        task_queue(1024, device) {
        ggml_cann_set_device(device);
        description = aclrtGetSocName();

        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
-        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
-            device, async_mode ? "ON" : "OFF");
+        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, device, async_mode ? "ON" : "OFF");
 #ifdef USE_ACL_GRAPH
        acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
-        GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
-              __func__, device,
-              acl_graph_mode ? "GRAPH" : "EAGER",
-              acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
+        GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", __func__, device, acl_graph_mode ? "GRAPH" : "EAGER",
+                      acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
 #endif
    }

@@ -549,8 +543,7 @@ struct ggml_backend_cann_context {
    aclrtStream stream() { return stream(0); }

    // TODO: each stream should have a memory pool.
-    std::unique_ptr<ggml_cann_pool>
-        mem_pool; /**< Memory pool for the device. */
+    std::unique_ptr<ggml_cann_pool> mem_pool; /**< Memory pool for the device. */

    /**
     * @brief Create a new memory pool for a given device.
@@ -563,7 +556,7 @@ struct ggml_backend_cann_context {
     * @brief Get or create the memory pool for the context.
     * @return Reference to the memory pool.
     */
-    ggml_cann_pool& pool() {
+    ggml_cann_pool & pool() {
        if (mem_pool == nullptr) {
            mem_pool = new_pool_for_device(device);
        }
@@ -2184,6 +2184,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_FLOOR:
+                case GGML_UNARY_OP_CEIL:
+                case GGML_UNARY_OP_ROUND:
+                case GGML_UNARY_OP_TRUNC:
                    {
                        n_tasks = 1;
                    } break;
@@ -3563,13 +3567,17 @@ void ggml_cpu_init(void) {
 #ifdef GGML_USE_OPENMP
            //if (!getenv("OMP_WAIT_POLICY")) {
            //    // set the wait policy to active, so that OpenMP threads don't sleep
-            //    putenv("OMP_WAIT_POLICY=active");
+            //    setenv("OMP_WAIT_POLICY", "active", 0)
            //}

            if (!getenv("KMP_BLOCKTIME")) {
                // set the time to wait before sleeping a thread
                // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
-                putenv("KMP_BLOCKTIME=200"); // 200ms
+#ifdef _WIN32
+                _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+#else
+                setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+#endif
            }
 #endif
        }
@@ -8993,6 +8993,22 @@ void ggml_compute_forward_unary(
            {
                ggml_compute_forward_exp(params, dst);
            } break;
+        case GGML_UNARY_OP_FLOOR:
+            {
+                ggml_compute_forward_floor(params, dst);
+            } break;
+        case GGML_UNARY_OP_CEIL:
+            {
+                ggml_compute_forward_ceil(params, dst);
+            } break;
+        case GGML_UNARY_OP_ROUND:
+            {
+                ggml_compute_forward_round(params, dst);
+            } break;
+        case GGML_UNARY_OP_TRUNC:
+            {
+                ggml_compute_forward_trunc(params, dst);
+            } break;
        case GGML_UNARY_OP_XIELU:
            {
                ggml_compute_forward_xielu(params, dst);
@@ -73,6 +73,22 @@ static inline float op_log(float x) {
    return logf(x);
 }

+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename src0_t, typename dst_t>
 static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
    unary_op<op_log>(params, dst);
 }

+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
    const float alpha_n = ggml_get_op_params_f32(dst, 1);
    const float alpha_p = ggml_get_op_params_f32(dst, 2);
@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);

 #ifdef __cplusplus
@@ -3625,9 +3625,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_2D:
        case GGML_OP_POOL_2D:
-        case GGML_OP_SUM:
        case GGML_OP_ACC:
            return true;
+        case GGML_OP_SUM:
+            return ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_ARGSORT:
            // TODO: Support arbitrary column width
            return op->src[0]->ne[0] <= 1024;
@@ -662,6 +662,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_LOG:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SUM:
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
        case GGML_OP_SUM_ROWS:
        case GGML_OP_MEAN:
        case GGML_OP_SOFT_MAX:
@@ -866,12 +866,25 @@ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {

    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);

+    int nth = 32; // SIMD width
+
+    while (nth < (int) n && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+
+    nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    nth = std::min(nth, (int) n);
+
+    const int nsg = (nth + 31) / 32;
+
    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);

-    ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1);
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, nsg * sizeof(float), 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);

    return 1;
 }
@@ -1727,18 +1727,48 @@ kernel void kernel_op_sum_f32(
        constant ggml_metal_kargs_sum & args,
        device const float * src0,
        device       float * dst,
-        ushort  tiitg[[thread_index_in_threadgroup]]) {
+        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {

-    if (tiitg != 0) {
+    if (args.np == 0) {
        return;
    }

-    float acc = 0.0f;
-    for (ulong i = 0; i < args.np; ++i) {
-        acc += src0[i];
+    const uint nsg = (ntg.x + 31) / 32;
+
+    float sumf = 0;
+
+    for (int64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
+        sumf += src0[i0];
    }

-    dst[0] = acc;
+    sumf = simd_sum(sumf);
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float total = 0;
+
+    if (sgitg == 0) {
+        float v = 0;
+
+        if (tpitg.x < nsg) {
+            v = shmem_f32[tpitg.x];
+        }
+
+        total = simd_sum(v);
+
+        if (tpitg.x == 0) {
+            dst[0] = total;
+        }
+    }
 }

 template <bool norm>
@@ -93,6 +93,7 @@ set(GGML_OPENCL_KERNELS
    mul_mv_id_mxfp4_f32_flat
    mul_mm_f32_f32_l4_lm
    mul_mm_f16_f32_l4_lm
+    mul_mm_q8_0_f32_l4_lm
    mul
    norm
    relu
@@ -408,6 +408,7 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mv_id_mxfp4_f32_flat;
    cl_program program_mul_mm_f32_f32_l4_lm;
    cl_program program_mul_mm_f16_f32_l4_lm;
+    cl_program program_mul_mm_q8_0_f32_l4_lm;

    cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
    cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
@@ -480,6 +481,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;

    std::vector<ProfilingInfo> profiling_info;

@@ -1191,6 +1193,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // mul_mm_q8_0_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q8_0_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q8_0_f32_l4_lm.cl");
+#endif
+        backend_ctx->program_mul_mm_q8_0_f32_l4_lm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_q8_0_f32_l4_lm, "kernel_mul_mm_q8_0_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
    // mul
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -6961,6 +6979,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
                return;
            }
+            case GGML_TYPE_Q8_0: {
+                if (ne11 < 32) {
+                    break;
+                }
+                kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
            default:
                break;
        }
@@ -4,6 +4,7 @@
 #define ACC_TYPE4 float4
 #define DATA_TYPE float
 #define DATA_TYPE4 float4
+#define MASK_DATA_TYPE half
 #define CONVERT_ACC4(x) (x)
 #define CONVERT_DATA4(x) (x)

@@ -148,7 +149,7 @@ __kernel void flash_attn_f32(
            if (k_row1 >= n_kv) score1 = -INFINITY;

            if (mask_base != NULL) {
-                const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
+                const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
                if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
                if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
            }
@@ -281,7 +282,7 @@ __kernel void flash_attn_f32_q1(
        }
        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
        if (mask_base != NULL) {
-            const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base);
+            const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base);
            score += slope * (ACC_TYPE)mask_ptr[k_idx];
        }
        if (logit_softcap > 0.0f) {
@@ -317,7 +318,7 @@ __kernel void flash_attn_f32_q1(
        }
        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
        if (mask_base != NULL) {
-            const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base);
+            const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base);
            score += slope * (ACC_TYPE)mask_ptr[k_idx];
        }
        if (logit_softcap > 0.0f) {
@@ -79,19 +79,33 @@ kernel void kernel_mul_mm_f16_f32_l4_lm(

    for (int block = 0; block < ne00; block += BK) {
        for (int l = 0; l < BM; l += loadstride_a) {
+            if (loadc_a + l < ne01) {
            const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
-            buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
-            buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
-            buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
-            buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
+            } else {
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0h;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0h;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0h;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0h;
+            }
        }

        for (int l = 0; l < BN; l += loadstride_b) {
-            const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
-            buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
-            buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
-            buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
-            buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            if (loadc_b + l < ne11) {
+                const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else {
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0h;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0h;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0h;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0h;
+            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);
@@ -79,19 +79,33 @@ kernel void kernel_mul_mm_f32_f32_l4_lm(

    for (int block = 0; block < ne00; block += BK) {
        for (int l = 0; l < BM; l += loadstride_a) {
-            const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
-            buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
-            buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
-            buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
-            buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
+            if (loadc_a + l < ne01) {
+                const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
+            } else {
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0f;
+            }
        }

        for (int l = 0; l < BN; l += loadstride_b) {
-            const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
-            buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
-            buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
-            buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
-            buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            if (loadc_b + l < ne11) {
+                const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else {
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
+            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);
@@ -0,0 +1,154 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define LOAD_VEC_A 4
+#define LOAD_VEC_B 4
+
+#define BM 64
+#define BN 64
+#define BK 32
+#define TM 4
+#define TN 8
+
+kernel void kernel_mul_mm_q8_0_f32_l4_lm(
+    global char4  * src0_q,
+    global half   * src0_d,
+    global float4 * src1,
+    ulong offset1,
+    global float  * dst,
+    ulong offsetd,
+
+    int ne00,
+    int ne01,
+    int ne02,
+    int ne11,
+    int ne12,
+
+    int stride_a,
+    int stride_b,
+    int stride_d,
+
+    int batch_stride_a,
+    int batch_stride_b,
+    int batch_stride_d,
+
+    int r2,
+    int r3
+) {
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+    local float buf_a[BM * BK];
+    local float buf_b[BN * BK];
+
+    const int batch_idx = get_global_id(2);
+
+    const int i13 = batch_idx / ne12;
+    const int i12 = batch_idx % ne12;
+
+    const int i03 = i13 / r3;
+    const int i02 = i12 / r2;
+
+    const int batch_idx_a = i03 * ne02 + i02;
+
+    const int ir = get_group_id(0);
+    const int ic = get_group_id(1);
+
+    const int tid = get_local_id(0);
+    const int th_r  = tid % (BM / TM);
+    const int th_c  = tid / (BM / TM);
+
+    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
+    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
+    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
+    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
+
+    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
+    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
+
+    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
+    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
+
+    float sums[TM * TN];
+    float cache_a[TM];
+    float cache_b[TN];
+
+    for (int i = 0; i < TM * TN; i++) {
+        sums[i] = 0.0f;
+    }
+
+    for (int block = 0; block < ne00; block += BK) {
+        for (int l = 0; l < BM; l += loadstride_a) {
+            if (loadc_a + l < ne01) {
+                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                int ib  = idx / 8;
+                int iqs = idx % 8;
+
+                float d = (float)src0_d[ib];
+                global char4 * qs = src0_q + ib*8 + iqs;
+                char4 q = *qs;
+                float4 v = convert_float4(q)*d;
+
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = v.s0;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = v.s1;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = v.s2;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = v.s3;
+            } else {
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0f;
+            }
+        }
+
+        for (int l = 0; l < BN; l += loadstride_b) {
+            if (loadc_b + l < ne11) {
+                int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else {
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        pos_a += BK / LOAD_VEC_A;
+        pos_b += BK / LOAD_VEC_B;
+
+        for (int i = 0; i < BK; i++) {
+            for (int j = 0; j < TM; j++) {
+                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
+            }
+
+            for (int j = 0; j < TN; j++) {
+                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
+            }
+
+            for (int cc = 0; cc < TN; cc++) {
+                for (int cr = 0; cr < TM; cr++) {
+                    const int sums_idx = cc*TM + cr;
+                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    const int dr = ir * BM + th_r * TM;
+    const int dc = ic * BN + th_c * TN;
+
+    const int offsets = batch_idx * batch_stride_d;
+
+    for (int cc = 0; cc < TN; cc++) {
+        for (int cr = 0; cr < TM; cr++) {
+            if (dr + cr < ne01 && dc + cc < ne11) {
+                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
+            }
+        }
+    }
+}
@@ -397,6 +397,14 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
        });
 }

+template<typename T>
+static void arange_kernel(T * dst, const int k, T start, T step,
+                         const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = start + static_cast<T>(i) * step;
+    }
+}
+
 template<typename T>
 static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
                             const int nb02, const int nb03, const int ne10, const int ne11,
@@ -565,6 +573,25 @@ static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx
 }


+static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    float start, stop, step;
+    memcpy(&start, dst->op_params, sizeof(float));
+    memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
+    dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    float * dst_ptr = (float *)dst->data;
+    const int k = (int)ggml_nelements(dst);
+    const int num_blocks = ceil_div(k, SYCL_ARANGE_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE),
+                          sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            arange_kernel(dst_ptr, k, start, step, item_ct1);
+        });
+}
+
 } // namespace ggml_sycl_detail


@@ -1090,3 +1117,8 @@ void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_geglu_quick(ctx, dst);
 }
+
+void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/0);
+    ggml_sycl_detail::ggml_sycl_op_arange(ctx, dst);
+}
@@ -81,4 +81,6 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

+void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 #endif // GGML_SYCL_ELEMENTWISE_HPP
@@ -42,6 +42,7 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 #include "ggml-sycl/set_rows.hpp"
+#include "ggml-sycl/set.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
 #include "ggml-sycl/getrows.hpp"
 #include "ggml-sycl/quantize.hpp"
@@ -2151,6 +2152,30 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *
    sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
 }

+inline void ggml_sycl_op_mean(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
+
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+
+    sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    main_stream->parallel_for(
+        sycl::range<1>(nrows),
+        [=](sycl::id<1> row) {
+            dst_dd[row] /= ncols;
+        }
+    );
+}
+
+
 inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_I32);
@@ -3535,6 +3560,12 @@ static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * ds
    ggml_sycl_op_sum_rows(ctx, dst);
 }

+static void ggml_sycl_mean(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    ggml_sycl_op_mean(ctx, dst);
+}
+
 static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
@@ -3589,6 +3620,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_GET_ROWS:
            ggml_sycl_get_rows(ctx, dst);
            break;
+        case GGML_OP_SET:
+            ggml_sycl_op_set(ctx, dst);
+            break;
        case GGML_OP_SET_ROWS:
            ggml_sycl_op_set_rows(ctx, dst);
            break;
@@ -3784,6 +3818,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_SUM_ROWS:
            ggml_sycl_sum_rows(ctx, dst);
            break;
+        case GGML_OP_MEAN:
+            ggml_sycl_mean(ctx, dst);
+            break;
        case GGML_OP_ARGSORT:
            ggml_sycl_argsort(ctx, dst);
            break;
@@ -3799,6 +3836,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_GATED_LINEAR_ATTN:
            ggml_sycl_op_gated_linear_attn(ctx, dst);
            break;
+        case GGML_OP_ARANGE:
+            ggml_sycl_arange(ctx, dst);
+            break;
        default:
            return false;
    }
@@ -4295,6 +4335,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                        return false;
                }
            }
+         case GGML_OP_SET:
+               return (op->type == GGML_TYPE_F32) &&
+                      (op->src[0] && op->src[1]) &&
+                      (op->src[0]->type == GGML_TYPE_F32) &&
+                      (op->src[1]->type == GGML_TYPE_F32);
+
        case GGML_OP_SET_ROWS:
            {
                return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
@@ -4431,6 +4477,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
        case GGML_OP_ARGSORT:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_POOL_2D:
@@ -4444,6 +4491,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_RWKV_WKV7:
        case GGML_OP_GATED_LINEAR_ATTN:
            return true;
+        case GGML_OP_ARANGE:
+            return op->type == GGML_TYPE_F32;
        default:
            return false;
    }
@@ -31,6 +31,7 @@
 #define SYCL_SQRT_BLOCK_SIZE 256
 #define SYCL_SIN_BLOCK_SIZE 256
 #define SYCL_SQR_BLOCK_SIZE 256
+#define SYCL_SET_BLOCK_SIZE 256
 #define SYCL_CPY_BLOCK_SIZE 32
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
@@ -49,6 +50,7 @@
 #define SYCL_ARGMAX_BLOCK_SIZE 256
 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
 #define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
+#define SYCL_ARANGE_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_SYCL_DMMV_X
@@ -0,0 +1,73 @@
+#include "presets.hpp"
+#include "common.hpp"
+#include "ggml.h"
+#include "set.hpp"
+#include <cstdint>
+#include <sycl/sycl.hpp>
+using namespace sycl;
+
+// Internal function: perform element-wise set operation for each thread
+inline void set_f32(const float* src, float* dst,
+                    const int64_t ne0, const int64_t ne1,
+                    const int64_t ne2, const int64_t ne3,
+                    const int64_t nb[3], const int64_t src_nb[3],
+                    const int64_t offset_elem,
+                    const nd_item<1>& item)
+{
+    const size_t idx = item.get_global_id(0);
+    const size_t total = ne0 * ne1 * ne2 * ne3;
+    if (idx >= total) return;
+
+    // Convert linear index to 4D indices
+    const size_t i3 = idx / (ne2 * ne1 * ne0);
+    const size_t rem = idx % (ne2 * ne1 * ne0);
+    const size_t i2 = rem / (ne1 * ne0);
+    const size_t rem2 = rem % (ne1 * ne0);
+    const size_t i1 = rem2 / ne0;
+    const size_t i0 = rem2 % ne0;
+
+    // Compute source and destination indices and copy
+    dst[i0 + i1*nb[0] + i2*nb[1] + i3*nb[2] + offset_elem] =
+        src[i0 + i1*src_nb[0] + i2*src_nb[1] + i3*src_nb[2]];
+}
+
+// Main function: prepare GPU queue and launch parallel_for
+void ggml_sycl_op_set(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+    const ggml_tensor* src0 = dst->src[0];
+    const ggml_tensor* src1 = dst->src[1];
+
+    // Ensure shapes and types are compatible
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+    GGML_ASSERT(dst->type == src0->type && src0->type == src1->type && dst->type == GGML_TYPE_F32);
+
+    const int32_t* opts = (const int32_t*) dst->op_params;
+    const int64_t nb[3]     = {opts[0]/sizeof(float), opts[1]/sizeof(float), opts[2]/sizeof(float)};
+    const int64_t offset_elem = opts[3] / sizeof(float);
+    const bool inplace = opts[4];
+
+    float* dst_ptr = (float*) dst->data;
+    const float* src0_ptr = (const float*) src0->data;
+    const float* src1_ptr = (const float*) src1->data;
+
+    queue_ptr stream = ctx.stream();
+
+    // Copy src0 to dst if not inplace
+    if (!inplace)
+        stream->memcpy(dst_ptr, src0_ptr, ggml_nbytes(dst));
+
+    const int64_t ne[4] = {src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]};
+    const int64_t src_nb[3] = {src1->nb[1]/sizeof(float), src1->nb[2]/sizeof(float), src1->nb[3]/sizeof(float)};
+
+    const size_t total_threads = ne[0]*ne[1]*ne[2]*ne[3];
+    const size_t grid_size = ((total_threads + SYCL_SET_BLOCK_SIZE - 1) / SYCL_SET_BLOCK_SIZE) * SYCL_SET_BLOCK_SIZE;
+
+    // Copy src0 to dst if not inplace
+    stream->parallel_for(
+        nd_range<1>(range<1>(grid_size), range<1>(SYCL_SET_BLOCK_SIZE)),
+        [=](nd_item<1> item) {
+            set_f32(src1_ptr, dst_ptr,
+                ne[0], ne[1], ne[2], ne[3],
+                nb, src_nb, offset_elem, item); }
+    );
+}
@@ -0,0 +1,5 @@
+#pragma once
+#include "backend.hpp"
+#include "ggml.h"
+
+void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
@@ -1144,9 +1144,13 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "EXP",
    "GELU_ERF",
    "XIELU",
+    "FLOOR",
+    "CEIL",
+    "ROUND",
+    "TRUNC",
 };

-static_assert(GGML_UNARY_OP_COUNT == 16, "GGML_UNARY_OP_COUNT != 16");
+static_assert(GGML_UNARY_OP_COUNT == 20, "GGML_UNARY_OP_COUNT != 20");

 static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
    "REGLU",
@@ -2749,6 +2753,62 @@ static struct ggml_tensor * ggml_glu_impl(
    return result;
 }

+// ggml_floor
+
+struct ggml_tensor * ggml_floor(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_FLOOR);
+}
+
+struct ggml_tensor * ggml_floor_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_FLOOR);
+}
+
+// ggml_ceil
+
+struct ggml_tensor * ggml_ceil(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_CEIL);
+}
+
+struct ggml_tensor * ggml_ceil_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_CEIL);
+}
+
+//ggml_round
+
+struct ggml_tensor * ggml_round(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ROUND);
+}
+
+struct ggml_tensor * ggml_round_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ROUND);
+}
+
+//ggml_trunc
+
+struct ggml_tensor * ggml_trunc(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_TRUNC);
+}
+
+struct ggml_tensor * ggml_trunc_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TRUNC);
+}
+
 struct ggml_tensor * ggml_glu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@@ -91,6 +91,7 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
           tensor.tensor_type not in (
                gguf.GGMLQuantizationType.F32,
                gguf.GGMLQuantizationType.F16,
+                gguf.GGMLQuantizationType.BF16,
           ):
            raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
    logger.info(f"* Preparing to convert from {file_endian} to {order}")
@@ -148,6 +149,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None

            # restore old shape in case it's ever used
            tensor.data.resize(oldshape)
+        elif tensor.tensor_type == gguf.GGMLQuantizationType.BF16:
+            # Special case for BF16
+            # It is 2-bytes data, but by default view loads it as 1-byte data.
+            # Change to correct view before byteswapping.
+            tensor.data.view(dtype=np.uint16).byteswap(inplace=True)
        else:
            # Handle other tensor types
            tensor.data.byteswap(inplace=True)
@@ -4588,20 +4588,31 @@ struct test_topk_moe: public test_case {
 struct test_sum : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> permute;
+    bool _use_permute;

    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
+        std::string v = VARS_TO_STR2(type, ne);
+        if (_use_permute) v += "," + VAR_TO_STR(permute);
+        return v;
    }

    test_sum(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 5, 4, 3})
-        : type(type), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
+            std::array<int64_t, 4> permute = {0, 0, 0, 0})
+        : type(type), ne(ne), permute(permute),
+            _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_param(a);
        ggml_set_name(a, "a");

+        if (_use_permute) {
+            a = ggml_permute(ctx, a, permute[0], permute[1], permute[2], permute[3]);
+            ggml_set_name(a, "a_permuted");
+        }
+
        ggml_tensor * out = ggml_sum(ctx, a);
        ggml_set_name(out, "out");

@@ -6354,6 +6365,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        }
    }

+#if 0
+    {
+        // Test paths in OpenCL
+        std::vector<int> ns = {32, 64, 128, 256, 512, 1024, 4096};
+        std::vector<int> ks = {896, 1536, 4096};
+        for (auto n : ns) {
+            for (auto k : ks) {
+                test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 1024, n, k, {1, 1}, {1, 1}));
+            }
+        }
+    }
+#endif
+
 #if 1
    for (ggml_type type_a : base_types) {
        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@@ -6724,6 +6748,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {

    test_cases.emplace_back(new test_sum());
    test_cases.emplace_back(new test_sum_rows());
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 2, 1, 3}));  // row-contiguous but non-contiguous
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 3, 2, 1}));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 1, 3, 2}));
    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, false));
    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, false, true));
    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, true));
@@ -6734,6 +6761,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }, { 1, 0, 2, 3 })); // sum dst not-contiguous
    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
@@ -301,6 +301,30 @@ static void test_simple_grammar() {
            "0123",
        }
    );
+    test_schema(
+        "min 1 max 900719925474091",
+        // Schema
+        R"""({
+            "type": "integer",
+            "exclusiveMinimum": 0,
+            "maximum": 900719925474091
+        })""",
+        // Passing strings
+        {
+            "1",
+            "2",
+            "10",
+            "900719925474090",
+            "900719925474091",
+        },
+        // Failing strings
+        {
+            "0",
+            "01",
+            "900719925474092",
+            "9007199254740910",
+        }
+    );
    test_schema(
        "min -1 max 1",
        R"""({
@@ -30,6 +30,7 @@
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"

 // vision-specific
+#define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
@@ -48,6 +49,7 @@
 #define KEY_MINICPMV_QUERY_NUM    "clip.minicpmv_query_num"

 // audio-specific
+#define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
 #define KEY_A_NUM_MEL_BINS      "clip.audio.num_mel_bins"
 #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"

@@ -2221,15 +2221,27 @@ struct clip_model_loader {
        // projector type
        std::string proj_type;
        {
+            // default key
            get_string(KEY_PROJ_TYPE, proj_type, false);
-            if (!proj_type.empty()) {
-                model.proj_type = clip_projector_type_from_string(proj_type);
+
+            // for models with mixed modalities
+            if (proj_type.empty()) {
+                if (modality == CLIP_MODALITY_VISION) {
+                    get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
+                } else if (modality == CLIP_MODALITY_AUDIO) {
+                    get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
+                } else {
+                    GGML_ABORT("unknown modality");
+                }
            }
+
+            model.proj_type = clip_projector_type_from_string(proj_type);
+
            if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
                throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
            }

-            // correct arch for multimodal models
+            // correct arch for multimodal models (legacy method)
            if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
                model.proj_type = modality == CLIP_MODALITY_VISION
                                    ? PROJECTOR_TYPE_QWEN25VL
@@ -3839,14 +3839,14 @@ struct server_context {

                                        {
                                            const auto token = slot.prompt.tokens[i];
-                                            const auto piece = common_token_to_piece(ctx, token);
+                                            const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]";
                                            ss0 << piece;
                                            st0 << std::setw(8) << token;
                                        }

                                        {
                                            const auto token = slot.task->tokens[i];
-                                            const auto piece = common_token_to_piece(ctx, token);
+                                            const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]";
                                            ss1 << piece;
                                            st1 << std::setw(8) << token;
                                        }
@@ -14,8 +14,7 @@
 	import { ChatSettingsFooter, ChatSettingsFields } from '$lib/components/app';
 	import * as Dialog from '$lib/components/ui/dialog';
 	import { ScrollArea } from '$lib/components/ui/scroll-area';
-	import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
-	import { config, updateMultipleConfig, resetConfig } from '$lib/stores/settings.svelte';
+	import { config, updateMultipleConfig } from '$lib/stores/settings.svelte';
 	import { setMode } from 'mode-watcher';
 	import type { Component } from 'svelte';

@@ -267,16 +266,13 @@
 	}

 	function handleReset() {
-		resetConfig();
+		localConfig = { ...config() };

-		localConfig = { ...SETTING_CONFIG_DEFAULT };
-
-		setMode(SETTING_CONFIG_DEFAULT.theme as 'light' | 'dark' | 'system');
-		originalTheme = SETTING_CONFIG_DEFAULT.theme as string;
+		setMode(localConfig.theme as 'light' | 'dark' | 'system');
+		originalTheme = localConfig.theme as string;
 	}

 	function handleSave() {
-		// Validate custom JSON if provided
 		if (localConfig.custom && typeof localConfig.custom === 'string' && localConfig.custom.trim()) {
 			try {
 				JSON.parse(localConfig.custom);
@@ -1,4 +1,5 @@
 <script lang="ts">
+	import { RotateCcw } from '@lucide/svelte';
 	import { Checkbox } from '$lib/components/ui/checkbox';
 	import { Input } from '$lib/components/ui/input';
 	import Label from '$lib/components/ui/label/label.svelte';
@@ -6,6 +7,9 @@
 	import { Textarea } from '$lib/components/ui/textarea';
 	import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO } from '$lib/constants/settings-config';
 	import { supportsVision } from '$lib/stores/server.svelte';
+	import { getParameterInfo, resetParameterToServerDefault } from '$lib/stores/settings.svelte';
+	import { ParameterSyncService } from '$lib/services/parameter-sync';
+	import ParameterSourceIndicator from './ParameterSourceIndicator.svelte';
 	import type { Component } from 'svelte';

 	interface Props {
@@ -16,22 +20,77 @@
 	}

 	let { fields, localConfig, onConfigChange, onThemeChange }: Props = $props();
+
+	// Helper function to get parameter source info for syncable parameters
+	function getParameterSourceInfo(key: string) {
+		if (!ParameterSyncService.canSyncParameter(key)) {
+			return null;
+		}
+
+		return getParameterInfo(key);
+	}
 </script>

 {#each fields as field (field.key)}
 	<div class="space-y-2">
 		{#if field.type === 'input'}
-			<Label for={field.key} class="block text-sm font-medium">
-				{field.label}
-			</Label>
+			{@const paramInfo = getParameterSourceInfo(field.key)}
+			{@const currentValue = String(localConfig[field.key] ?? '')}
+			{@const propsDefault = paramInfo?.serverDefault}
+			{@const isCustomRealTime = (() => {
+				if (!paramInfo || propsDefault === undefined) return false;

-			<Input
-				id={field.key}
-				value={String(localConfig[field.key] ?? '')}
-				onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
-				placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
-				class="w-full md:max-w-md"
-			/>
+				// Apply same rounding logic for real-time comparison
+				const inputValue = currentValue;
+				const numericInput = parseFloat(inputValue);
+				const normalizedInput = !isNaN(numericInput)
+					? Math.round(numericInput * 1000000) / 1000000
+					: inputValue;
+				const normalizedDefault =
+					typeof propsDefault === 'number'
+						? Math.round(propsDefault * 1000000) / 1000000
+						: propsDefault;
+
+				return normalizedInput !== normalizedDefault;
+			})()}
+
+			<div class="flex items-center gap-2">
+				<Label for={field.key} class="text-sm font-medium">
+					{field.label}
+				</Label>
+				{#if isCustomRealTime}
+					<ParameterSourceIndicator />
+				{/if}
+			</div>
+
+			<div class="relative w-full md:max-w-md">
+				<Input
+					id={field.key}
+					value={currentValue}
+					oninput={(e) => {
+						// Update local config immediately for real-time badge feedback
+						onConfigChange(field.key, e.currentTarget.value);
+					}}
+					placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
+					class="w-full {isCustomRealTime ? 'pr-8' : ''}"
+				/>
+				{#if isCustomRealTime}
+					<button
+						type="button"
+						onclick={() => {
+							resetParameterToServerDefault(field.key);
+							// Trigger UI update by calling onConfigChange with the default value
+							const defaultValue = propsDefault ?? SETTING_CONFIG_DEFAULT[field.key];
+							onConfigChange(field.key, String(defaultValue));
+						}}
+						class="absolute top-1/2 right-2 inline-flex h-5 w-5 -translate-y-1/2 items-center justify-center rounded transition-colors hover:bg-muted"
+						aria-label="Reset to default"
+						title="Reset to default"
+					>
+						<RotateCcw class="h-3 w-3" />
+					</button>
+				{/if}
+			</div>
 			{#if field.help || SETTING_CONFIG_INFO[field.key]}
 				<p class="mt-1 text-xs text-muted-foreground">
 					{field.help || SETTING_CONFIG_INFO[field.key]}
@@ -59,14 +118,28 @@
 				(opt: { value: string; label: string; icon?: Component }) =>
 					opt.value === localConfig[field.key]
 			)}
+			{@const paramInfo = getParameterSourceInfo(field.key)}
+			{@const currentValue = localConfig[field.key]}
+			{@const propsDefault = paramInfo?.serverDefault}
+			{@const isCustomRealTime = (() => {
+				if (!paramInfo || propsDefault === undefined) return false;

-			<Label for={field.key} class="block text-sm font-medium">
-				{field.label}
-			</Label>
+				// For select fields, do direct comparison (no rounding needed)
+				return currentValue !== propsDefault;
+			})()}
+
+			<div class="flex items-center gap-2">
+				<Label for={field.key} class="text-sm font-medium">
+					{field.label}
+				</Label>
+				{#if isCustomRealTime}
+					<ParameterSourceIndicator />
+				{/if}
+			</div>

 			<Select.Root
 				type="single"
-				value={localConfig[field.key]}
+				value={currentValue}
 				onValueChange={(value) => {
 					if (field.key === 'theme' && value && onThemeChange) {
 						onThemeChange(value);
@@ -75,16 +148,34 @@
 					}
 				}}
 			>
-				<Select.Trigger class="w-full md:w-auto md:max-w-md">
-					<div class="flex items-center gap-2">
-						{#if selectedOption?.icon}
-							{@const IconComponent = selectedOption.icon}
-							<IconComponent class="h-4 w-4" />
-						{/if}
+				<div class="relative w-full md:w-auto md:max-w-md">
+					<Select.Trigger class="w-full">
+						<div class="flex items-center gap-2">
+							{#if selectedOption?.icon}
+								{@const IconComponent = selectedOption.icon}
+								<IconComponent class="h-4 w-4" />
+							{/if}

-						{selectedOption?.label || `Select ${field.label.toLowerCase()}`}
-					</div>
-				</Select.Trigger>
+							{selectedOption?.label || `Select ${field.label.toLowerCase()}`}
+						</div>
+					</Select.Trigger>
+					{#if isCustomRealTime}
+						<button
+							type="button"
+							onclick={() => {
+								resetParameterToServerDefault(field.key);
+								// Trigger UI update by calling onConfigChange with the default value
+								const defaultValue = propsDefault ?? SETTING_CONFIG_DEFAULT[field.key];
+								onConfigChange(field.key, String(defaultValue));
+							}}
+							class="absolute top-1/2 right-8 inline-flex h-5 w-5 -translate-y-1/2 items-center justify-center rounded transition-colors hover:bg-muted"
+							aria-label="Reset to default"
+							title="Reset to default"
+						>
+							<RotateCcw class="h-3 w-3" />
+						</button>
+					{/if}
+				</div>
 				<Select.Content>
 					{#if field.options}
 						{#each field.options as option (option.value)}
@@ -1,6 +1,8 @@
 <script lang="ts">
 	import { Button } from '$lib/components/ui/button';
 	import * as AlertDialog from '$lib/components/ui/alert-dialog';
+	import { forceSyncWithServerDefaults } from '$lib/stores/settings.svelte';
+	import { RotateCcw } from '@lucide/svelte';

 	interface Props {
 		onReset?: () => void;
@@ -16,7 +18,9 @@
 	}

 	function handleConfirmReset() {
+		forceSyncWithServerDefaults();
 		onReset?.();
+
 		showResetDialog = false;
 	}

@@ -26,7 +30,13 @@
 </script>

 <div class="flex justify-between border-t border-border/30 p-6">
-	<Button variant="outline" onclick={handleResetClick}>Reset to default</Button>
+	<div class="flex gap-2">
+		<Button variant="outline" onclick={handleResetClick}>
+			<RotateCcw class="h-3 w-3" />
+
+			Reset to default
+		</Button>
+	</div>

 	<Button onclick={handleSave}>Save settings</Button>
 </div>
@@ -36,8 +46,9 @@
 		<AlertDialog.Header>
 			<AlertDialog.Title>Reset Settings to Default</AlertDialog.Title>
 			<AlertDialog.Description>
-				Are you sure you want to reset all settings to their default values? This action cannot be
-				undone and will permanently remove all your custom configurations.
+				Are you sure you want to reset all settings to their default values? This will reset all
+				parameters to the values provided by the server's /props endpoint and remove all your custom
+				configurations.
 			</AlertDialog.Description>
 		</AlertDialog.Header>
 		<AlertDialog.Footer>
@@ -0,0 +1,18 @@
+<script lang="ts">
+	import { Wrench } from '@lucide/svelte';
+	import { Badge } from '$lib/components/ui/badge';
+
+	interface Props {
+		class?: string;
+	}
+
+	let { class: className = '' }: Props = $props();
+</script>
+
+<Badge
+	variant="secondary"
+	class="h-5 bg-orange-100 px-1.5 py-0.5 text-xs text-orange-800 dark:bg-orange-900 dark:text-orange-200 {className}"
+>
+	<Wrench class="mr-1 h-3 w-3" />
+	Custom
+</Badge>
@@ -25,6 +25,7 @@ export { default as ChatScreen } from './chat/ChatScreen/ChatScreen.svelte';
 export { default as ChatSettingsDialog } from './chat/ChatSettings/ChatSettingsDialog.svelte';
 export { default as ChatSettingsFooter } from './chat/ChatSettings/ChatSettingsFooter.svelte';
 export { default as ChatSettingsFields } from './chat/ChatSettings/ChatSettingsFields.svelte';
+export { default as ParameterSourceIndicator } from './chat/ChatSettings/ParameterSourceIndicator.svelte';

 export { default as ChatSidebar } from './chat/ChatSidebar/ChatSidebar.svelte';
 export { default as ChatSidebarConversationItem } from './chat/ChatSidebar/ChatSidebarConversationItem.svelte';
@@ -154,9 +154,20 @@
 		return mutated ? tempDiv.innerHTML : html;
 	}

+	function normalizeMathDelimiters(text: string): string {
+		return text
+			.replace(/(^|[^\\])\\\[((?:\\.|[\s\S])*?)\\\]/g, (_, prefix: string, content: string) => {
+				return `${prefix}$$${content}$$`;
+			})
+			.replace(/(^|[^\\])\\\(((?:\\.|[\s\S])*?)\\\)/g, (_, prefix: string, content: string) => {
+				return `${prefix}$${content}$`;
+			});
+	}
+
 	async function processMarkdown(text: string): Promise<string> {
 		try {
-			const result = await processor().process(text);
+			const normalized = normalizeMathDelimiters(text);
+			const result = await processor().process(normalized);
 			const html = String(result);
 			const enhancedLinks = enhanceLinks(html);

@@ -0,0 +1,2 @@
+export const PRECISION_MULTIPLIER = 1000000;
+export const PRECISION_DECIMAL_PLACES = 6;
@@ -0,0 +1,135 @@
+import { describe, it, expect } from 'vitest';
+import { ParameterSyncService } from './parameter-sync';
+import type { ApiLlamaCppServerProps } from '$lib/types/api';
+
+describe('ParameterSyncService', () => {
+	describe('roundFloatingPoint', () => {
+		it('should fix JavaScript floating-point precision issues', () => {
+			// Test the specific values from the screenshot
+			const mockServerParams = {
+				top_p: 0.949999988079071,
+				min_p: 0.009999999776482582,
+				temperature: 0.800000011920929,
+				top_k: 40,
+				samplers: ['top_k', 'typ_p', 'top_p', 'min_p', 'temperature']
+			};
+
+			const result = ParameterSyncService.extractServerDefaults({
+				...mockServerParams,
+				// Add other required fields to match the API type
+				n_predict: 512,
+				seed: -1,
+				dynatemp_range: 0.0,
+				dynatemp_exponent: 1.0,
+				xtc_probability: 0.0,
+				xtc_threshold: 0.1,
+				typ_p: 1.0,
+				repeat_last_n: 64,
+				repeat_penalty: 1.0,
+				presence_penalty: 0.0,
+				frequency_penalty: 0.0,
+				dry_multiplier: 0.0,
+				dry_base: 1.75,
+				dry_allowed_length: 2,
+				dry_penalty_last_n: -1,
+				mirostat: 0,
+				mirostat_tau: 5.0,
+				mirostat_eta: 0.1,
+				stop: [],
+				max_tokens: -1,
+				n_keep: 0,
+				n_discard: 0,
+				ignore_eos: false,
+				stream: true,
+				logit_bias: [],
+				n_probs: 0,
+				min_keep: 0,
+				grammar: '',
+				grammar_lazy: false,
+				grammar_triggers: [],
+				preserved_tokens: [],
+				chat_format: '',
+				reasoning_format: '',
+				reasoning_in_content: false,
+				thinking_forced_open: false,
+				'speculative.n_max': 0,
+				'speculative.n_min': 0,
+				'speculative.p_min': 0.0,
+				timings_per_token: false,
+				post_sampling_probs: false,
+				lora: [],
+				top_n_sigma: 0.0,
+				dry_sequence_breakers: []
+			} as ApiLlamaCppServerProps['default_generation_settings']['params']);
+
+			// Check that the problematic floating-point values are rounded correctly
+			expect(result.top_p).toBe(0.95);
+			expect(result.min_p).toBe(0.01);
+			expect(result.temperature).toBe(0.8);
+			expect(result.top_k).toBe(40); // Integer should remain unchanged
+			expect(result.samplers).toBe('top_k;typ_p;top_p;min_p;temperature');
+		});
+
+		it('should preserve non-numeric values', () => {
+			const mockServerParams = {
+				samplers: ['top_k', 'temperature'],
+				max_tokens: -1,
+				temperature: 0.7
+			};
+
+			const result = ParameterSyncService.extractServerDefaults({
+				...mockServerParams,
+				// Minimal required fields
+				n_predict: 512,
+				seed: -1,
+				dynatemp_range: 0.0,
+				dynatemp_exponent: 1.0,
+				top_k: 40,
+				top_p: 0.95,
+				min_p: 0.05,
+				xtc_probability: 0.0,
+				xtc_threshold: 0.1,
+				typ_p: 1.0,
+				repeat_last_n: 64,
+				repeat_penalty: 1.0,
+				presence_penalty: 0.0,
+				frequency_penalty: 0.0,
+				dry_multiplier: 0.0,
+				dry_base: 1.75,
+				dry_allowed_length: 2,
+				dry_penalty_last_n: -1,
+				mirostat: 0,
+				mirostat_tau: 5.0,
+				mirostat_eta: 0.1,
+				stop: [],
+				n_keep: 0,
+				n_discard: 0,
+				ignore_eos: false,
+				stream: true,
+				logit_bias: [],
+				n_probs: 0,
+				min_keep: 0,
+				grammar: '',
+				grammar_lazy: false,
+				grammar_triggers: [],
+				preserved_tokens: [],
+				chat_format: '',
+				reasoning_format: '',
+				reasoning_in_content: false,
+				thinking_forced_open: false,
+				'speculative.n_max': 0,
+				'speculative.n_min': 0,
+				'speculative.p_min': 0.0,
+				timings_per_token: false,
+				post_sampling_probs: false,
+				lora: [],
+				top_n_sigma: 0.0,
+				dry_sequence_breakers: []
+			} as ApiLlamaCppServerProps['default_generation_settings']['params']);
+
+			expect(result.samplers).toBe('top_k;temperature');
+			expect(result.max_tokens).toBe(-1);
+			expect(result.temperature).toBe(0.7);
+		});
+	});
+});
@@ -0,0 +1,202 @@
+/**
+ * ParameterSyncService - Handles synchronization between server defaults and user settings
+ *
+ * This service manages the complex logic of merging server-provided default parameters
+ * with user-configured overrides, ensuring the UI reflects the actual server state
+ * while preserving user customizations.
+ *
+ * **Key Responsibilities:**
+ * - Extract syncable parameters from server props
+ * - Merge server defaults with user overrides
+ * - Track parameter sources (server, user, default)
+ * - Provide sync utilities for settings store integration
+ */
+
+import type { ApiLlamaCppServerProps } from '$lib/types/api';
+import { normalizeFloatingPoint } from '$lib/utils/precision';
+
+export type ParameterSource = 'default' | 'custom';
+export type ParameterValue = string | number | boolean;
+export type ParameterRecord = Record<string, ParameterValue>;
+
+export interface ParameterInfo {
+	value: string | number | boolean;
+	source: ParameterSource;
+	serverDefault?: string | number | boolean;
+	userOverride?: string | number | boolean;
+}
+
+export interface SyncableParameter {
+	key: string;
+	serverKey: string;
+	type: 'number' | 'string' | 'boolean';
+	canSync: boolean;
+}
+
+/**
+ * Mapping of webui setting keys to server parameter keys
+ * Only parameters that should be synced from server are included
+ */
+export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
+	{ key: 'temperature', serverKey: 'temperature', type: 'number', canSync: true },
+	{ key: 'top_k', serverKey: 'top_k', type: 'number', canSync: true },
+	{ key: 'top_p', serverKey: 'top_p', type: 'number', canSync: true },
+	{ key: 'min_p', serverKey: 'min_p', type: 'number', canSync: true },
+	{ key: 'dynatemp_range', serverKey: 'dynatemp_range', type: 'number', canSync: true },
+	{ key: 'dynatemp_exponent', serverKey: 'dynatemp_exponent', type: 'number', canSync: true },
+	{ key: 'xtc_probability', serverKey: 'xtc_probability', type: 'number', canSync: true },
+	{ key: 'xtc_threshold', serverKey: 'xtc_threshold', type: 'number', canSync: true },
+	{ key: 'typ_p', serverKey: 'typ_p', type: 'number', canSync: true },
+	{ key: 'repeat_last_n', serverKey: 'repeat_last_n', type: 'number', canSync: true },
+	{ key: 'repeat_penalty', serverKey: 'repeat_penalty', type: 'number', canSync: true },
+	{ key: 'presence_penalty', serverKey: 'presence_penalty', type: 'number', canSync: true },
+	{ key: 'frequency_penalty', serverKey: 'frequency_penalty', type: 'number', canSync: true },
+	{ key: 'dry_multiplier', serverKey: 'dry_multiplier', type: 'number', canSync: true },
+	{ key: 'dry_base', serverKey: 'dry_base', type: 'number', canSync: true },
+	{ key: 'dry_allowed_length', serverKey: 'dry_allowed_length', type: 'number', canSync: true },
+	{ key: 'dry_penalty_last_n', serverKey: 'dry_penalty_last_n', type: 'number', canSync: true },
+	{ key: 'max_tokens', serverKey: 'max_tokens', type: 'number', canSync: true },
+	{ key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true }
+];
+
+export class ParameterSyncService {
+	/**
+	 * Round floating-point numbers to avoid JavaScript precision issues
+	 */
+	private static roundFloatingPoint(value: ParameterValue): ParameterValue {
+		return normalizeFloatingPoint(value) as ParameterValue;
+	}
+
+	/**
+	 * Extract server default parameters that can be synced
+	 */
+	static extractServerDefaults(
+		serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null
+	): ParameterRecord {
+		if (!serverParams) return {};
+
+		const extracted: ParameterRecord = {};
+
+		for (const param of SYNCABLE_PARAMETERS) {
+			if (param.canSync && param.serverKey in serverParams) {
+				const value = (serverParams as unknown as Record<string, ParameterValue>)[param.serverKey];
+				if (value !== undefined) {
+					// Apply precision rounding to avoid JavaScript floating-point issues
+					extracted[param.key] = this.roundFloatingPoint(value);
+				}
+			}
+		}
+
+		// Handle samplers array conversion to string
+		if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
+			extracted.samplers = serverParams.samplers.join(';');
+		}
+
+		return extracted;
+	}
+
+	/**
+	 * Merge server defaults with current user settings
+	 * Returns updated settings that respect user overrides while using server defaults
+	 */
+	static mergeWithServerDefaults(
+		currentSettings: ParameterRecord,
+		serverDefaults: ParameterRecord,
+		userOverrides: Set<string> = new Set()
+	): ParameterRecord {
+		const merged = { ...currentSettings };
+
+		for (const [key, serverValue] of Object.entries(serverDefaults)) {
+			// Only update if user hasn't explicitly overridden this parameter
+			if (!userOverrides.has(key)) {
+				merged[key] = this.roundFloatingPoint(serverValue);
+			}
+		}
+
+		return merged;
+	}
+
+	/**
+	 * Get parameter information including source and values
+	 */
+	static getParameterInfo(
+		key: string,
+		currentValue: ParameterValue,
+		propsDefaults: ParameterRecord,
+		userOverrides: Set<string>
+	): ParameterInfo {
+		const hasPropsDefault = propsDefaults[key] !== undefined;
+		const isUserOverride = userOverrides.has(key);
+
+		// Simple logic: either using default (from props) or custom (user override)
+		const source: ParameterSource = isUserOverride ? 'custom' : 'default';
+
+		return {
+			value: currentValue,
+			source,
+			serverDefault: hasPropsDefault ? propsDefaults[key] : undefined, // Keep same field name for compatibility
+			userOverride: isUserOverride ? currentValue : undefined
+		};
+	}
+
+	/**
+	 * Check if a parameter can be synced from server
+	 */
+	static canSyncParameter(key: string): boolean {
+		return SYNCABLE_PARAMETERS.some((param) => param.key === key && param.canSync);
+	}
+
+	/**
+	 * Get all syncable parameter keys
+	 */
+	static getSyncableParameterKeys(): string[] {
+		return SYNCABLE_PARAMETERS.filter((param) => param.canSync).map((param) => param.key);
+	}
+
+	/**
+	 * Validate server parameter value
+	 */
+	static validateServerParameter(key: string, value: ParameterValue): boolean {
+		const param = SYNCABLE_PARAMETERS.find((p) => p.key === key);
+		if (!param) return false;
+
+		switch (param.type) {
+			case 'number':
+				return typeof value === 'number' && !isNaN(value);
+			case 'string':
+				return typeof value === 'string';
+			case 'boolean':
+				return typeof value === 'boolean';
+			default:
+				return false;
+		}
+	}
+
+	/**
+	 * Create a diff between current settings and server defaults
+	 */
+	static createParameterDiff(
+		currentSettings: ParameterRecord,
+		serverDefaults: ParameterRecord
+	): Record<string, { current: ParameterValue; server: ParameterValue; differs: boolean }> {
+		const diff: Record<
+			string,
+			{ current: ParameterValue; server: ParameterValue; differs: boolean }
+		> = {};
+
+		for (const key of this.getSyncableParameterKeys()) {
+			const currentValue = currentSettings[key];
+			const serverValue = serverDefaults[key];
+
+			if (serverValue !== undefined) {
+				diff[key] = {
+					current: currentValue,
+					server: serverValue,
+					differs: currentValue !== serverValue
+				};
+			}
+		}
+
+		return diff;
+	}
+}
@@ -125,6 +125,12 @@ class ServerStore {
 		return this._slotsEndpointAvailable;
 	}

+	get serverDefaultParams():
+		| ApiLlamaCppServerProps['default_generation_settings']['params']
+		| null {
+		return this._serverProps?.default_generation_settings?.params || null;
+	}
+
 	/**
 	 * Check if slots endpoint is available based on server properties and endpoint support
 	 */
@@ -273,3 +279,4 @@ export const supportedModalities = () => serverStore.supportedModalities;
 export const supportsVision = () => serverStore.supportsVision;
 export const supportsAudio = () => serverStore.supportsAudio;
 export const slotsEndpointAvailable = () => serverStore.slotsEndpointAvailable;
+export const serverDefaultParams = () => serverStore.serverDefaultParams;
@@ -33,11 +33,25 @@

 import { browser } from '$app/environment';
 import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
+import { normalizeFloatingPoint } from '$lib/utils/precision';
+import { ParameterSyncService } from '$lib/services/parameter-sync';
+import { serverStore } from '$lib/stores/server.svelte';
+import { setConfigValue, getConfigValue, configToParameterRecord } from '$lib/utils/config-helpers';

 class SettingsStore {
 	config = $state<SettingsConfigType>({ ...SETTING_CONFIG_DEFAULT });
 	theme = $state<string>('auto');
 	isInitialized = $state(false);
+	userOverrides = $state<Set<string>>(new Set());
+
+	/**
+	 * Helper method to get server defaults with null safety
+	 * Centralizes the pattern of getting and extracting server defaults
+	 */
+	private getServerDefaults(): Record<string, string | number | boolean> {
+		const serverParams = serverStore.serverDefaultParams;
+		return serverParams ? ParameterSyncService.extractServerDefaults(serverParams) : {};
+	}

 	constructor() {
 		if (browser) {
@@ -67,14 +81,20 @@ class SettingsStore {

 		try {
 			const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
+
 			// Merge with defaults to prevent breaking changes
 			this.config = {
 				...SETTING_CONFIG_DEFAULT,
 				...savedVal
 			};
+
+			// Load user overrides
+			const savedOverrides = JSON.parse(localStorage.getItem('userOverrides') || '[]');
+			this.userOverrides = new Set(savedOverrides);
 		} catch (error) {
 			console.warn('Failed to parse config from localStorage, using defaults:', error);
 			this.config = { ...SETTING_CONFIG_DEFAULT };
+			this.userOverrides = new Set();
 		}
 	}

@@ -86,14 +106,30 @@ class SettingsStore {

 		this.theme = localStorage.getItem('theme') || 'auto';
 	}
-
 	/**
 	 * Update a specific configuration setting
 	 * @param key - The configuration key to update
 	 * @param value - The new value for the configuration key
 	 */
-	updateConfig<K extends keyof SettingsConfigType>(key: K, value: SettingsConfigType[K]) {
+	updateConfig<K extends keyof SettingsConfigType>(key: K, value: SettingsConfigType[K]): void {
 		this.config[key] = value;
+
+		if (ParameterSyncService.canSyncParameter(key as string)) {
+			const propsDefaults = this.getServerDefaults();
+			const propsDefault = propsDefaults[key as string];
+
+			if (propsDefault !== undefined) {
+				const normalizedValue = normalizeFloatingPoint(value);
+				const normalizedDefault = normalizeFloatingPoint(propsDefault);
+
+				if (normalizedValue === normalizedDefault) {
+					this.userOverrides.delete(key as string);
+				} else {
+					this.userOverrides.add(key as string);
+				}
+			}
+		}
+
 		this.saveConfig();
 	}

@@ -103,6 +139,26 @@ class SettingsStore {
 	 */
 	updateMultipleConfig(updates: Partial<SettingsConfigType>) {
 		Object.assign(this.config, updates);
+
+		const propsDefaults = this.getServerDefaults();
+
+		for (const [key, value] of Object.entries(updates)) {
+			if (ParameterSyncService.canSyncParameter(key)) {
+				const propsDefault = propsDefaults[key];
+
+				if (propsDefault !== undefined) {
+					const normalizedValue = normalizeFloatingPoint(value);
+					const normalizedDefault = normalizeFloatingPoint(propsDefault);
+
+					if (normalizedValue === normalizedDefault) {
+						this.userOverrides.delete(key);
+					} else {
+						this.userOverrides.add(key);
+					}
+				}
+			}
+		}
+
 		this.saveConfig();
 	}

@@ -114,6 +170,8 @@ class SettingsStore {

 		try {
 			localStorage.setItem('config', JSON.stringify(this.config));
+
+			localStorage.setItem('userOverrides', JSON.stringify(Array.from(this.userOverrides)));
 		} catch (error) {
 			console.error('Failed to save config to localStorage:', error);
 		}
@@ -185,6 +243,129 @@ class SettingsStore {
 	getAllConfig(): SettingsConfigType {
 		return { ...this.config };
 	}
+
+	/**
+	 * Initialize settings with props defaults when server properties are first loaded
+	 * This sets up the default values from /props endpoint
+	 */
+	syncWithServerDefaults(): void {
+		const serverParams = serverStore.serverDefaultParams;
+		if (!serverParams) {
+			console.warn('No server parameters available for initialization');
+
+			return;
+		}
+
+		const propsDefaults = this.getServerDefaults();
+
+		for (const [key, propsValue] of Object.entries(propsDefaults)) {
+			const currentValue = getConfigValue(this.config, key);
+
+			const normalizedCurrent = normalizeFloatingPoint(currentValue);
+			const normalizedDefault = normalizeFloatingPoint(propsValue);
+
+			if (normalizedCurrent === normalizedDefault) {
+				this.userOverrides.delete(key);
+				setConfigValue(this.config, key, propsValue);
+			} else if (!this.userOverrides.has(key)) {
+				setConfigValue(this.config, key, propsValue);
+			}
+		}
+
+		this.saveConfig();
+		console.log('Settings initialized with props defaults:', propsDefaults);
+		console.log('Current user overrides after sync:', Array.from(this.userOverrides));
+	}
+
+	/**
+	 * Clear all user overrides (for debugging)
+	 */
+	clearAllUserOverrides(): void {
+		this.userOverrides.clear();
+		this.saveConfig();
+		console.log('Cleared all user overrides');
+	}
+
+	/**
+	 * Reset all parameters to their default values (from props)
+	 * This is used by the "Reset to Default" functionality
+	 * Prioritizes server defaults from /props, falls back to webui defaults
+	 */
+	forceSyncWithServerDefaults(): void {
+		const propsDefaults = this.getServerDefaults();
+		const syncableKeys = ParameterSyncService.getSyncableParameterKeys();
+
+		for (const key of syncableKeys) {
+			if (propsDefaults[key] !== undefined) {
+				const normalizedValue = normalizeFloatingPoint(propsDefaults[key]);
+
+				setConfigValue(this.config, key, normalizedValue);
+			} else {
+				if (key in SETTING_CONFIG_DEFAULT) {
+					const defaultValue = getConfigValue(SETTING_CONFIG_DEFAULT, key);
+
+					setConfigValue(this.config, key, defaultValue);
+				}
+			}
+
+			this.userOverrides.delete(key);
+		}
+
+		this.saveConfig();
+	}
+
+	/**
+	 * Get parameter information including source for a specific parameter
+	 */
+	getParameterInfo(key: string) {
+		const propsDefaults = this.getServerDefaults();
+		const currentValue = getConfigValue(this.config, key);
+
+		return ParameterSyncService.getParameterInfo(
+			key,
+			currentValue ?? '',
+			propsDefaults,
+			this.userOverrides
+		);
+	}
+
+	/**
+	 * Reset a parameter to server default (or webui default if no server default)
+	 */
+	resetParameterToServerDefault(key: string): void {
+		const serverDefaults = this.getServerDefaults();
+
+		if (serverDefaults[key] !== undefined) {
+			const value = normalizeFloatingPoint(serverDefaults[key]);
+
+			this.config[key as keyof SettingsConfigType] =
+				value as SettingsConfigType[keyof SettingsConfigType];
+		} else {
+			if (key in SETTING_CONFIG_DEFAULT) {
+				const defaultValue = getConfigValue(SETTING_CONFIG_DEFAULT, key);
+
+				setConfigValue(this.config, key, defaultValue);
+			}
+		}
+
+		this.userOverrides.delete(key);
+		this.saveConfig();
+	}
+
+	/**
+	 * Get diff between current settings and server defaults
+	 */
+	getParameterDiff() {
+		const serverDefaults = this.getServerDefaults();
+		if (Object.keys(serverDefaults).length === 0) return {};
+
+		const configAsRecord = configToParameterRecord(
+			this.config,
+			ParameterSyncService.getSyncableParameterKeys()
+		);
+
+		return ParameterSyncService.createParameterDiff(configAsRecord, serverDefaults);
+	}
 }

 // Create and export the settings store instance
@@ -204,3 +385,11 @@ export const resetTheme = settingsStore.resetTheme.bind(settingsStore);
 export const resetAll = settingsStore.resetAll.bind(settingsStore);
 export const getConfig = settingsStore.getConfig.bind(settingsStore);
 export const getAllConfig = settingsStore.getAllConfig.bind(settingsStore);
+export const syncWithServerDefaults = settingsStore.syncWithServerDefaults.bind(settingsStore);
+export const forceSyncWithServerDefaults =
+	settingsStore.forceSyncWithServerDefaults.bind(settingsStore);
+export const getParameterInfo = settingsStore.getParameterInfo.bind(settingsStore);
+export const resetParameterToServerDefault =
+	settingsStore.resetParameterToServerDefault.bind(settingsStore);
+export const getParameterDiff = settingsStore.getParameterDiff.bind(settingsStore);
+export const clearAllUserOverrides = settingsStore.clearAllUserOverrides.bind(settingsStore);
@@ -0,0 +1,53 @@
+/**
+ * Type-safe configuration helpers
+ *
+ * Provides utilities for safely accessing and modifying configuration objects
+ * with dynamic keys while maintaining TypeScript type safety.
+ */
+
+import type { SettingsConfigType } from '$lib/types/settings';
+
+/**
+ * Type-safe helper to access config properties dynamically
+ * Provides better type safety than direct casting to Record
+ */
+export function setConfigValue<T extends SettingsConfigType>(
+	config: T,
+	key: string,
+	value: unknown
+): void {
+	if (key in config) {
+		(config as Record<string, unknown>)[key] = value;
+	}
+}
+
+/**
+ * Type-safe helper to get config values dynamically
+ */
+export function getConfigValue<T extends SettingsConfigType>(
+	config: T,
+	key: string
+): string | number | boolean | undefined {
+	const value = (config as Record<string, unknown>)[key];
+	return value as string | number | boolean | undefined;
+}
+
+/**
+ * Convert a SettingsConfigType to a ParameterRecord for specific keys
+ * Useful for parameter synchronization operations
+ */
+export function configToParameterRecord<T extends SettingsConfigType>(
+	config: T,
+	keys: string[]
+): Record<string, string | number | boolean> {
+	const record: Record<string, string | number | boolean> = {};
+
+	for (const key of keys) {
+		const value = getConfigValue(config, key);
+		if (value !== undefined) {
+			record[key] = value;
+		}
+	}
+
+	return record;
+}
@@ -0,0 +1,25 @@
+/**
+ * Floating-point precision utilities
+ *
+ * Provides functions to normalize floating-point numbers for consistent comparison
+ * and display, addressing JavaScript's floating-point precision issues.
+ */
+
+import { PRECISION_MULTIPLIER } from '$lib/constants/precision';
+
+/**
+ * Normalize floating-point numbers for consistent comparison
+ * Addresses JavaScript floating-point precision issues (e.g., 0.949999988079071 → 0.95)
+ */
+export function normalizeFloatingPoint(value: unknown): unknown {
+	return typeof value === 'number'
+		? Math.round(value * PRECISION_MULTIPLIER) / PRECISION_MULTIPLIER
+		: value;
+}
+
+/**
+ * Type-safe version that only accepts numbers
+ */
+export function normalizeNumber(value: number): number {
+	return Math.round(value * PRECISION_MULTIPLIER) / PRECISION_MULTIPLIER;
+}
@@ -9,7 +9,7 @@
 	} from '$lib/stores/chat.svelte';
 	import * as Sidebar from '$lib/components/ui/sidebar/index.js';
 	import { serverStore } from '$lib/stores/server.svelte';
-	import { config } from '$lib/stores/settings.svelte';
+	import { config, settingsStore } from '$lib/stores/settings.svelte';
 	import { ModeWatcher } from 'mode-watcher';
 	import { Toaster } from 'svelte-sonner';
 	import { goto } from '$app/navigation';
@@ -95,6 +95,15 @@
 		serverStore.fetchServerProps();
 	});

+	// Sync settings when server props are loaded
+	$effect(() => {
+		const serverProps = serverStore.serverProps;
+
+		if (serverProps?.default_generation_settings?.params) {
+			settingsStore.syncWithServerDefaults();
+		}
+	});
+
 	// Monitor API key changes and redirect to error page if removed or changed when required
 	$effect(() => {
 		const apiKey = config().apiKey;
Author	SHA1	Message	Date
Olivier Chafik	79967ec596	grammar : use int64_t to avoid int overflows in int schema to grammar conversion logic (#16626 )	2025-10-17 08:59:31 +03:00
GittyBurstein	ceff6bb253	SYCL SET operator optimized for F32 tensors (#16350 ) * SYCL/SET: implement operator + wire-up; docs/ops updates; element_wise & ggml-sycl changes * sycl(SET): re-apply post-rebase; revert manual docs/ops.md; style cleanups * move SET op to standalone file, GPU-only implementation * Update SYCL SET operator for F32 * ci: fix editorconfig issues (LF endings, trailing spaces, final newline) * fixed ggml-sycl.cpp --------- Co-authored-by: Gitty Burstein <gitty@example.com>	2025-10-17 10:36:40 +08:00
Xuan-Son Nguyen	1bb4f43380	mtmd : support home-cooked Mistral Small Omni (#14928 )	2025-10-16 19:00:31 +02:00
Pascal	683fa6ba4e	fix: added a normalization step for MathJax-style \[\] and \(\) delimiters (#16599 ) * fix: added a normalization step for MathJax-style \[\] and \(\) delimiters So inline and block equations are converted before KaTeX rendering, enabling proper display of model-generated LaTeX in the WebUI * chore: update webui build output	2025-10-16 16:28:41 +02:00
GittyBurstein	b22572e97d	sycl : add ARANGE operator (#16362 ) * SYCL: update element-wise ops and presets * clean arange * Re-trigger CI --------- Co-authored-by: Gitty Burstein <gitty@example.com>	2025-10-16 15:26:21 +02:00
Chenguang Li	7a50cf388a	CANN: format code using .clang-format (#15863 ) This commit applies .clang-format rules to all source files under the ggml-cann directory to ensure consistent coding style and readability. The .clang-format option `SortIncludes: false` has been set to disable automatic reordering of include directives. No functional changes are introduced. Co-authored-by: hipudding <huafengchun@gmail.com>	2025-10-16 16:41:11 +08:00
takasurazeem	6f5d924637	common : Update the docs on -t --threads (#16236 ) * Update the docs on -t --threads * Revert "Update the docs on -t --threads" This reverts commit `eba97345e2`. * docs: clarify -t/--threads parameter uses CPU threads and defaults to all available cores * Update arg.cpp	2025-10-16 08:11:33 +03:00
takuya kodama	adc9b60f19	ggml-cpu: replace putenv with setenv for const-correctness (#16573 ) ## Why it failed When compiling with strict compiler flags (-Wwrite-strings -Werror=discarded-qualifiers), the build fails with the following error: ``` cmake \ -S . \ -B ../llama.cpp.build \ --preset=x64-linux-gcc-debug \ -DCMAKE_INSTALL_PREFIX=/tmp/local \ -DCMAKE_C_FLAGS="-Wwrite-strings -Werror=discarded-qualifiers" && \ cmake --build ../llama.cpp.build/ ... /home/otegami/work/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c: In function ‘ggml_cpu_init’: /home/otegami/work/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c:3572:24: error: passing argument 1 of ‘putenv’ discards ‘const’ qualifier from pointer target type [-Werror=discarded-qualifiers] 3572 \| putenv("KMP_BLOCKTIME=200"); // 200ms \| ^~~~~~~~~~~~~~~~~~~ In file included from /home/otegami/work/cpp/llama.cpp/ggml/src/./ggml-impl.h:10, from /home/otegami/work/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h:6, from /home/otegami/work/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h:3, from /home/otegami/work/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c:6: /usr/include/stdlib.h:786:26: note: expected ‘char ’ but argument is of type ‘const char ’ 786 \| extern int putenv (char __string) __THROW __nonnull ((1)); \| ~~~~~~^~~~~~~~ cc1: some warnings being treated as errors ninja: build stopped: subcommand failed. ``` The issue is that putenv() expects a non-const char but receives a string literal (const char ). ## How to fix This PR replaces putenv("KMP_BLOCKTIME=200") with setenv("KMP_BLOCKTIME", "200", 0). Benefits of setenv(): - Accepts const char parameters (no qualifier warnings) - Makes copies of the strings (safer memory handling) - The third parameter (0) ensures we don't overwrite if already set	2025-10-16 08:10:32 +03:00
yael-works	ee50ee1ead	SYCL: Add GGML_OP_MEAN operator support (#16009 ) * SYCL: Add GGML_OP_MEAN operator support * SYCL: Fix formatting for GGML_OP_MEAN case * Update ggml/src/ggml-sycl/ggml-sycl.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-10-16 12:21:28 +08:00
Aleksei Nikiforov	7adc79c032	gguf-py : add support for endian conversion of BF16 data (#16594 ) BF16 requires special handling in this script while it's a 2-bytes data, but view is 1-byte by default. Switch to correct view before attempting byteswapping. With this change correctly byteswapping models like Meta-Llama-3-8B-Instruct-bf16-GGUF should be possible.	2025-10-15 22:43:08 +02:00
safranowith	466c1911ab	cpu : add FLOOR, CEIL, ROUND and TRUNC unary operators (#16083 ) * CPU: Add support for FLOOR,CEIL,ROUND and TRUNC unary operators - Added the operators to unary op enum - Implemented API functions - Implemented forward and unary-op logic in CPU backend - Updated ggml_get_n_tasks - Updated operators names array and static_assert - Updated docs and enabled automatic tests * docs: add documentation for ggml_trunc and ggml_trunc_inplace in ggml.h * chore: remove trailing whitespace from ggml.h * Remove unresolved merge markers * Apply review suggestions: cleanup formatting, enum order and leftover artifacts * Regenerate ops.md using create_ops_docs.py	2025-10-15 21:24:51 +02:00
lhez	0cb7a0683b	opencl: add q8_0 mm support (#16469 ) * opencl: add mm_q8_0_f32 * opencl: fix data loading for incomplete tile * opencl: use q8_0 mm for larger matrix * opencl: add some tests to cover the path	2025-10-15 10:51:04 -07:00
lhez	d93f8439b0	opencl: fix FA for f32 (#16584 )	2025-10-15 10:48:28 -07:00
Aleksander Grygier	f9fb33f263	Add server-driven parameter defaults and syncing (#16515 )	2025-10-15 16:22:20 +02:00
Sam/Samuel	f4ce81c45e	metal: optimise `GGML_OP_SUM` (#16559 ) * optimise GGML_OP_SUM * add non-contiguous tests by permuting the input * change tests to require full contiguity of OP_SUM * cuda : add check GGML_OP_SUM --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2025-10-15 17:05:56 +03:00
Georgi Gerganov	17304cbcc1	server : fix img token logs (#16595 )	2025-10-15 16:53:12 +03:00