model : make rope_yarn_log_mul optional for deepseek2 (#14896 )

* make rope_yarn_log_mul optional for deepseek2
* default rope_yarn_log_mul = 0.0f
llama : fix kq_scale for the attention layers of PLaMo2 (#14892 )
2026-07-01 01:57:43 +02:00 · 2025-07-27 11:18:37 +03:00 · 2025-07-27 09:38:44 +02:00 · 2025-07-27 09:36:43 +08:00 · 2025-07-27 00:28:14 +02:00 · 2025-07-26 17:56:18 +08:00
22 changed files with 16291 additions and 13866 deletions
@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4

 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -3791,7 +3791,7 @@ class Plamo2Model(TextModel):
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))

        # Mamba parameters
        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3802,7 +3802,7 @@ class Plamo2Model(TextModel):
        self.gguf_writer.add_ssm_group_count(0)

        # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -2,6 +2,11 @@

 List of GGML operations and backend support status.

+## How to add a backend to this table:
+
+1. Run `test-backend-ops support --output csv` with your backend name and redirect the output to a CSV file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
+2. Regenerate `docs/ops.md` via `./scripts/create_ops_docs.py`
+
 Legend:
 - ✅ Fully supported by this backend
 - 🟡 Partially supported by this backend
@@ -18,7 +23,8 @@ Legend:
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ |
 |                            CLAMP | ❌ | ✅ | ✅ | 🟡 |
 |                           CONCAT | ❌ | ✅ | 🟡 | ✅ |
-|                             CONT | ❌ | ✅ | 🟡 | ✅ |
+|                             CONT | ❌ | ✅ | ✅ | ✅ |
+|                          CONV_2D | ❌ | ✅ | ❌ | ❌ |
 |                       CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
 |                CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
@@ -30,7 +36,7 @@ Legend:
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
 |                              DIV | ❌ | ✅ | ✅ | 🟡 |
 |                              DUP | ❌ | ✅ | 🟡 | 🟡 |
-|                              ELU | ❌ | ✅ | ❌ | 🟡 |
+|                              ELU | ❌ | ✅ | 🟡 | 🟡 |
 |                              EXP | ❌ | ✅ | 🟡 | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
@@ -66,14 +72,16 @@ Legend:
 |                      REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
 |                    RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
-|                     RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
+|                     RMS_NORM_MUL | ❌ | ❌ | ❌ | ✅ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ❌ |
+|                             ROLL | ❌ | ✅ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ |
 |                        ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
 |                        RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
 |                            SCALE | ❌ | ✅ | ✅ | ✅ |
 |                              SET | ❌ | ✅ | ❌ | ✅ |
-|                         SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
+|                         SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 |
 |                              SGN | ❌ | ✅ | 🟡 | ❌ |
 |                          SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
 |                             SILU | ❌ | ✅ | 🟡 | 🟡 |
@@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    for (int i = 0; i < final_dims; i++) {
        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
    }
+    size_t elem_offset = offset / ggml_element_size(tensor);
+    acl_storage_len += elem_offset;

    // Reverse ne and stride.
    std::reverse(acl_ne, acl_ne + final_dims);
@@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,

    aclTensor* acl_tensor = aclCreateTensor(
        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
-        offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
+        elem_offset, format, &acl_storage_len, 1,
        tensor->data);

    return acl_tensor;
@@ -99,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
    }
 }

-void ggml_cann_unary_op(
+void ggml_cann_op_unary(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
@@ -111,6 +111,42 @@ void ggml_cann_unary_op(
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }

+/**
+ * @brief Gated unary op for the CANN backend: dst = unary_op(value) * gate.
+ *
+ * When dst->src[1] is set, src[0] and src[1] are used directly as the two operands.
+ * Otherwise src[0] is viewed as two halves along ne[0] (same strides, half the
+ * row length); op_params[1] ("swapped") selects which half is fed to unary_op.
+ *
+ * @param unary_op Callable invoked as unary_op(ctx, acl_value, acl_dst).
+ * @param ctx      CANN backend context.
+ * @param dst      Destination tensor; inputs are dst->src[0] and optionally dst->src[1].
+ */
+void ggml_cann_op_unary_gated(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
+    if (src1) {
+        // Dual input mode: the operands are separate tensors.
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+
+        acl_src0 = ggml_cann_create_tensor(src0);
+        acl_src1 = ggml_cann_create_tensor(src1);
+    } else {
+        // Packed input mode: two views over src0, each ne[0] / 2 wide; the second
+        // view starts ne[0] elements (converted to bytes) into every row.
+        int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
+        size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
+        acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
+        acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
+        if (swapped) {
+            std::swap(acl_src0, acl_src1);
+        }
+    }
+
+    unary_op(ctx, acl_src0, acl_dst);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
+
+    // acl_src1 is created in both input modes, so it must be released in both;
+    // releasing it only when src1 != nullptr leaked the packed-mode view.
+    ggml_cann_release_resources(ctx, acl_src0, acl_dst);
+    ggml_cann_release_resources(ctx, acl_src1);
+}
+
 /**
 * @brief Repeats elements of a tensor along each dimension according to the
 * specified repeat array.
@@ -1098,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 */
 template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
-    void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src);
@@ -1109,49 +1109,125 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
 }

 /**
- * @brief   Applies a unary operation to a ggml tensor using the CANN backend.
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
 *
- * @details This function performs a unary operation on the input tensor using
- * a user-provided lambda or callable object `unary_op`, which accepts the CANN
- * context and two ACL tensors (source and destination). Internally, this function
- * creates ACL representations of the ggml tensors and invokes the unary operation.
- * The result is stored in the destination tensor `dst`. This utility abstracts the
- * common boilerplate of tensor conversion and cleanup when implementing unary ops.
+ * @details This function applies a unary operation to the input tensor using
+ * a user-provided lambda or callable `unary_op`. The lambda receives the
+ * CANN backend context and two ACL tensors: the source and the destination.
 *
- * @param unary_op A callable that performs the unary operation using CANN APIs.
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the result will be stored.
- *            The source tensor is retrieved from `dst->src[0]`.
+ * Internally, this function handles the conversion from GGML tensors to ACL tensors,
+ * calls the provided unary op, and manages resource cleanup. The input is assumed
+ * to be `dst->src[0]`, and the result is written to `dst`.
+ *
+ * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
+ * @param ctx The CANN context for operation execution.
+ * @param dst The destination ggml_tensor where the result will be stored.
+ *            The input tensor is assumed to be `dst->src[0]`.
+ *
+ * @see GGML_CANN_CALL_OP_UNARY
 */
-void ggml_cann_unary_op(
+void ggml_cann_op_unary(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

 /**
- * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
+ * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
 *
- * This macro defines an inline lambda wrapping a specific ACL operation name,
- * and passes it to the templated ggml_cann_unary_op function. It simplifies
- * calling unary ops by hiding the lambda boilerplate.
+ * @details This function performs a gated activation such as GEGLU or ReGLU.
+ * It supports two input modes:
+ *
+ * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
+ *    These are used directly as the value and gate tensors.
+ *
+ * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
+ *    contain a concatenation of value and gate along the first dimension. This tensor
+ *    will be split into two equal halves to form the value and gate inputs.
+ *
+ * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
+ * then multiplies the result in-place with the gate tensor:
 *
- * Internally, the lambda will call:
 * @code
- * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
+ * dst = unary_op(value) * gate;
 * @endcode
 *
+ * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
+ * order of value/gate in the packed input case.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
+ *                 It receives (ctx, acl_value_tensor, acl_output_tensor).
+ * @param ctx      The CANN context used for execution.
+ * @param dst      The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
+ *
+ * @see GGML_CANN_CALL_OP_UNARY_GATED
+ */
+void ggml_cann_op_unary_gated(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
+ *
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
+ * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
+ * unary ops in the CANN backend.
+ *
+ * Internally, this macro expands to a lambda like:
+ * @code
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
+ *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
+ * };
+ * @endcode
+ *
+ * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
+ *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
- * @see ggml_cann_unary_op
+ * @see ggml_cann_op_unary
 * @see GGML_CANN_CALL_ACLNN_OP
 */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                              \
+#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                              \
    do {                                                              \
        auto lambda = [](ggml_backend_cann_context& ctx,              \
            aclTensor* acl_src,                                       \
            aclTensor* acl_dst) {                                     \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
        };                                                            \
-        ggml_cann_unary_op(lambda, ctx, dst);                         \
+        ggml_cann_op_unary(lambda, ctx, dst);                         \
    }                                                                 \
    while (0)
+
+/**
+ * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
+ *
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
+ * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
+ * executing gated unary ops in the CANN backend.
+ *
+ * Internally, this macro expands to a lambda like:
+ * @code
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
+ *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
+ * };
+ * @endcode
+ *
+ * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
+ *
+ * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
+ *
+ * @see ggml_cann_op_unary_gated
+ * @see GGML_CANN_CALL_ACLNN_OP
+ */
+#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                        \
+    do {                                                              \
+        auto lambda = [](ggml_backend_cann_context& ctx,              \
+            aclTensor* acl_src,                                       \
+            aclTensor* acl_dst) {                                     \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
+        };                                                            \
+        ggml_cann_op_unary_gated(lambda, ctx, dst);                   \
+    }                                                                 \
+    while (0)
+
 #endif  // CANN_ACLNN_OPS
@@ -1681,16 +1681,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
-                    GGML_CANN_CALL_UNARY_OP(Abs);
+                    GGML_CANN_CALL_OP_UNARY(Abs);
                    break;
                case GGML_UNARY_OP_NEG:
-                    GGML_CANN_CALL_UNARY_OP(Neg);
+                    GGML_CANN_CALL_OP_UNARY(Neg);
                    break;
                case GGML_UNARY_OP_GELU:
-                    GGML_CANN_CALL_UNARY_OP(Gelu);
+                case GGML_UNARY_OP_GELU_ERF:
+                    // aclnnGelu internally uses the erf-based approximation.
+                    GGML_CANN_CALL_OP_UNARY(Gelu);
                    break;
                case GGML_UNARY_OP_SILU:
-                    GGML_CANN_CALL_UNARY_OP(Silu);
+                    GGML_CANN_CALL_OP_UNARY(Silu);
                    break;
                case GGML_UNARY_OP_GELU_QUICK: {
                    auto lambda = [](ggml_backend_cann_context& ctx,
@@ -1698,31 +1700,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                        aclTensor* acl_dst) {
                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
                    };
-                    ggml_cann_unary_op(lambda, ctx, dst);
+                    ggml_cann_op_unary(lambda, ctx, dst);
                } break;
                case GGML_UNARY_OP_TANH:
-                    GGML_CANN_CALL_UNARY_OP(Tanh);
+                    GGML_CANN_CALL_OP_UNARY(Tanh);
                    break;
                case GGML_UNARY_OP_RELU:
-                    GGML_CANN_CALL_UNARY_OP(Relu);
+                    GGML_CANN_CALL_OP_UNARY(Relu);
                    break;
                case GGML_UNARY_OP_SIGMOID:
-                    GGML_CANN_CALL_UNARY_OP(Sigmoid);
+                    GGML_CANN_CALL_OP_UNARY(Sigmoid);
                    break;
                case GGML_UNARY_OP_HARDSIGMOID:
-                    GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
+                    GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
                    break;
                case GGML_UNARY_OP_HARDSWISH:
-                    GGML_CANN_CALL_UNARY_OP(Hardswish);
+                    GGML_CANN_CALL_OP_UNARY(Hardswish);
                    break;
                case GGML_UNARY_OP_EXP:
-                    GGML_CANN_CALL_UNARY_OP(Exp);
+                    GGML_CANN_CALL_OP_UNARY(Exp);
                    break;
                case GGML_UNARY_OP_ELU:
                    ggml_cann_elu(ctx, dst);
                    break;
                case GGML_UNARY_OP_SGN:
-                    GGML_CANN_CALL_UNARY_OP(Sign);
+                    GGML_CANN_CALL_OP_UNARY(Sign);
                    break;
                case GGML_UNARY_OP_STEP:
                    ggml_cann_step(ctx, dst);
@@ -1731,6 +1733,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                    return false;
            }
            break;
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(dst)) {
+                case GGML_GLU_OP_REGLU:
+                    GGML_CANN_CALL_OP_UNARY_GATED(Relu);
+                    break;
+                case GGML_GLU_OP_GEGLU:
+                case GGML_GLU_OP_GEGLU_ERF:
+                    // aclnnGelu internally uses the erf-based approximation.
+                    GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
+                    break;
+                case GGML_GLU_OP_SWIGLU:
+                    GGML_CANN_CALL_OP_UNARY_GATED(Silu);
+                    break;
+                case GGML_GLU_OP_GEGLU_QUICK: {
+                    auto lambda = [](ggml_backend_cann_context& ctx,
+                        aclTensor* acl_src,
+                        aclTensor* acl_dst) {
+                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
+                    };
+                    ggml_cann_op_unary_gated(lambda, ctx, dst);
+                } break;
+                default:
+                    return false;
+            }
+            break;
        case GGML_OP_NORM:
            ggml_cann_norm(ctx, dst);
            break;
@@ -1773,7 +1800,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
            break;
        case GGML_OP_SQRT:
-            GGML_CANN_CALL_UNARY_OP(Sqrt);
+            GGML_CANN_CALL_OP_UNARY(Sqrt);
            break;
        case GGML_OP_CLAMP:
            ggml_cann_clamp(ctx, dst);
@@ -1818,16 +1845,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_argmax(ctx, dst);
            break;
        case GGML_OP_COS:
-            ggml_cann_unary_op<aclnn_cos>(ctx, dst);
+            ggml_cann_op_unary<aclnn_cos>(ctx, dst);
            break;
        case GGML_OP_SIN:
-            ggml_cann_unary_op<aclnn_sin>(ctx, dst);
+            ggml_cann_op_unary<aclnn_sin>(ctx, dst);
            break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cann_conv_transpose_1d(ctx, dst);
            break;
        case GGML_OP_LOG:
-            GGML_CANN_CALL_UNARY_OP(Log);
+            GGML_CANN_CALL_OP_UNARY(Log);
            break;
        case GGML_OP_MEAN:
            ggml_cann_mean(ctx, dst);
@@ -2101,10 +2128,23 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_GELU_ERF:
                    return true;
                default:
                    return false;
            }
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(op)) {
+                case GGML_GLU_OP_REGLU:
+                case GGML_GLU_OP_GEGLU:
+                case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_GEGLU_ERF:
+                case GGML_GLU_OP_GEGLU_QUICK:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
                case GGML_TYPE_F16:
@@ -56,7 +56,7 @@
 #define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
 #define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renaming
 #define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300

@@ -72,8 +72,9 @@
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
 #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
-#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
@@ -226,6 +227,10 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#define AMD_MFMA_AVAILABLE
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
    return GGML_CUDA_CC_IS_CDNA(cc);
 }

+// AMD CDNA3 matrix cores. Support for other CDNA generations will be added later.
+static bool amd_mfma_available(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
@@ -1330,14 +1330,16 @@ static __global__ void flash_attn_ext_f16(
         ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
-    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
-    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
-    GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -37,16 +37,16 @@ static __global__ void flash_attn_tile_ext_f32(
 #endif // FP16_MMA_AVAILABLE
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+        GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
-        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
-        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
-        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
-        GGML_UNUSED(nb23);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+        GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+        GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+        GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+        GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+        GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
        NO_DEVICE_CODE;
        return;
    }
@@ -282,16 +282,16 @@ static __global__ void flash_attn_tile_ext_f32(
    }
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
@@ -329,16 +329,16 @@ static __global__ void flash_attn_vec_ext_f16(
    }
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
-    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
-    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
-    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
-    GGML_UNUSED(nb23);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
@@ -12,7 +12,8 @@
 // The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
 // All matrix tiles have ne physical 32 bit elements per warp.
 //
-// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes; passing unaligned pointers is undefined behavior.

 #include "common.cuh"

@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
    struct tile {
        static constexpr int I  = I_;
        static constexpr int J  = J_;
-        static constexpr int ne = I * J / WARP_SIZE;
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+        static constexpr int ne = I * J / 64;
+        T x[ne] = {0};
+
+        static __device__ __forceinline__ int get_i(const int l) { // Index i (first tile dimension) of the l-th element held by the calling thread.
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return threadIdx.x % 16; // i depends only on threadIdx.x, not on l
+            } else if constexpr (I == 16 && J == 8) {
+                return threadIdx.x % 16; // i depends only on threadIdx.x, not on l
+            } else if constexpr (I == 32 && J == 4) {
+                return threadIdx.x % 32; // i depends only on threadIdx.x, not on l
+            } else if constexpr (I == 16 && J == 16) {
+                return 4 * (threadIdx.x / 16) + l; // each run of 16 threads covers 4 consecutive i values, selected by l
+            } else if constexpr (I == 32 && J == 32) {
+                return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); // l / 4 picks a band of 8; threadIdx.x / 32 and l % 4 pick the position inside it
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented"); // condition is false for every valid tile shape, so unsupported shapes fail to compile
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) { // Index j (second tile dimension) of the l-th element held by the calling thread.
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return (2 * ((threadIdx.x / 16) % 2) + l); // pair of j values chosen by the parity of threadIdx.x / 16, l picks within the pair
+            } else if constexpr (I == 16 && J == 8) {
+                return 2 * (threadIdx.x / 16) + l; // each run of 16 threads covers one pair of j values, l picks within the pair
+            } else if constexpr (I == 32 && J == 4) {
+                return 2 * (threadIdx.x / 32) + l; // each run of 32 threads covers one pair of j values, l picks within the pair
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16; // j depends only on threadIdx.x, not on l
+            } else if constexpr (I == 32 && J == 32) {
+                return threadIdx.x % 32; // j depends only on threadIdx.x, not on l
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented"); // condition is false for every valid tile shape, so unsupported shapes fail to compile
+            }
+        }
+#else
+        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

        static __device__ __forceinline__ int get_i(const int l) {
@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    };

    template <int I_, int J_>
@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {

    template <int I, int J, typename T>
    static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+#if defined(AMD_MFMA_AVAILABLE)
+        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+#pragma unroll
+            for (int l = 0; l < t.ne; ++l) {
+                t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
+            }
+        } else {
+            int64_t * xi = (int64_t *) t.x;
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+            xi[0] = xs[0];
+        }
+#else
 #pragma unroll
        for (int l = 0; l < t.ne; ++l) {
            t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
        }
+#endif // defined(AMD_MFMA_AVAILABLE)
    }

    template <typename T>
@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix(
            tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef NEW_MMA_AVAILABLE
+#if defined(NEW_MMA_AVAILABLE)
        int * xi = (int * ) t.x;
        const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
        asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
@@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
        NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
    }
+
+    static __device__ __forceinline__ void mma(
+            tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+        int32x4_t * acc = (int32x4_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
+                                                      B.x[0],
+                                                      acc[0],
+                                                      0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
+                                                      B.x[1],
+                                                      acc[0],
+                                                      0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
+
+    static __device__ __forceinline__ void mma(
+            tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
+        int32x16_t * acc = (int32x16_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
+                                                     B.x[0],
+                                                     acc[0],
+                                                     0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
+                                                     B.x[1],
+                                                     acc[0],
+                                                     0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
 }
@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3  =  dst->nb[3] / ts_dst;

-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+                            || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));

    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+                            || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+                            && src1_ncols == ne11;
    const mmq_args args = {
        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
        ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
@@ -304,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return false;
    }

-    if (new_mma_available(cc)) {
+    if (new_mma_available(cc) || amd_mfma_available(cc)) {
        return true;
    }

@@ -44,6 +44,9 @@ static __global__ void k_set_rows_quant(
    block_type * dst_block = dst_row_ptr + i00 / qk;

    quantize_func(src_block, dst_block);
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
 }

 // Template dispatch function for quantized set_rows
@@ -160,7 +160,19 @@
 #endif

 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
-#define CDNA
+#define CDNA // For the entire family
+#endif
+
+#if defined(__gfx942__)
+#define CDNA3
+#endif
+
+#if defined(__gfx90a__)
+#define CDNA2
+#endif
+
+#if defined(__gfx908__)
+#define CDNA1
 #endif

 #if defined(__GFX12__)
@@ -112,6 +112,11 @@ class DocsGenerator:
        lines.append("")
        lines.append("List of GGML operations and backend support status.")
        lines.append("")
+        lines.append("## How to add a backend to this table:")
+        lines.append("")
+        lines.append("1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)")
+        lines.append("2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`")
+        lines.append("")
        lines.append("Legend:")
        lines.append("- ✅ Fully supported by this backend")
        lines.append("- 🟡 Partially supported by this backend")
@@ -98,7 +98,7 @@ struct llama_hparams {
    float    rope_freq_scale_train;
    float    rope_freq_scale_train_swa;
    uint32_t n_ctx_orig_yarn;
-    float    rope_yarn_log_mul;
+    float    rope_yarn_log_mul = 0.0f;

    std::array<int, 4> rope_sections;

@@ -1369,7 +1369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_16B; break;
@@ -16191,7 +16191,7 @@ private:
        {
            // PLaMo-2 uses combined QKV tensor
            ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "qkv", il);
+            cb(qkv, "wqkv", il);

            // split QKV tensor into Q, K, V
            const int64_t n_embd_head_q = hparams.n_embd_head_k;
@@ -16231,7 +16231,7 @@ private:
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
        }

        cb(cur, "attn_out", il);
@@ -16306,8 +16306,9 @@ private:
            ggml_build_forward_expand(gf,
                ggml_cpy(ctx0, last_conv,
                    ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+            cb(conv_states_all, "mamba_conv1d_state", il);

            // 1D convolution
            x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16370,9 +16371,9 @@ private:
            // store last states
            ggml_build_forward_expand(gf,
                ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
-                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
-                            kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
+                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
+            cb(ssm_states_all, "mamba_ssm_states", il);

            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
            cb(y, "mamba_y_view", il);
@@ -868,16 +868,30 @@ struct sql_printer : public printer {

 struct csv_printer : public printer {
    void print_header() override {
-        std::vector<std::string> fields = test_result::get_fields();
+
+        std::vector<std::string> fields     = test_result::get_fields();
+        std::vector<std::string> fields_csv = get_fields_csv();
        for (size_t i = 0; i < fields.size(); i++) {
+            if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) {
+                continue;
+            }
            printf("\"%s\"%s", fields[i].c_str(), i < fields.size() - 1 ? "," : "");
        }
        printf("\n");
    }

    void print_test_result(const test_result & result) override {
-        std::vector<std::string> values = result.get_values();
+
+        std::vector<std::string> values     = result.get_values();
+        std::vector<std::string> fields     = test_result::get_fields();
+        std::vector<std::string> fields_csv = get_fields_csv();
+
        for (size_t i = 0; i < values.size(); i++) {
+
+            if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) {
+                continue;
+            }
+
            // Escape quotes and wrap in quotes for CSV
            std::string escaped_value = values[i];
            size_t pos = 0;
@@ -889,6 +903,19 @@ struct csv_printer : public printer {
        }
        printf("\n");
    }
+
+    static std::vector<std::string> get_fields_csv() {
+        return {
+            "op_name",
+            "op_params",
+            "supported",
+            "error_message",
+            "test_mode",
+            "backend_reg_name",
+            "backend_name",
+        };
+    }
+
 };

 static std::unique_ptr<printer> create_printer(output_formats format) {
Author	SHA1	Message	Date
Gabriel Larson	4762ad7316	model : make rope_yarn_log_mul optional for deepseek2 (#14896 ) * make rope_yarn_log_mul optional for deepseek2 * default rope_yarn_log_mul = 0.0f	2025-07-27 11:18:37 +03:00
Shunta Saito	1dc9614e06	llama : fix kq_scale for the attention layers of PLaMo2 (#14892 ) * Fix dimensions for expand * Change dimensions to copy states to cache * Fix the default value for plamo2 conversion * Fix scale given to build_attn * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-07-27 09:38:44 +02:00
Aman Gupta	446595b9b3	Docs: add instructions for adding backends (#14889 )	2025-07-27 09:36:43 +08:00
deepsek	66906cd82a	HIP: Enable Matrix cores for MMQ Kernels, Enable stream-K for CDNA 3 (#14624 ) This commit adds support for MFMA instructions to MMQ. CDNA1/GFX908 CDNA2/GFX90a and CDNA3/GFX942 are supported by the MFMA-enabled code path added by this commit. The code path and stream-k is only enabled on CDNA3 for now as it fails to outperform blas in all cases on the other devices. Blas is currently only consistently outperformed on CDNA3 due to issues in the amd-provided blas libraries. This commit also improves the awareness of MMQ towards different warp sizes and as a side effect improves the performance of all quant formats besides q4_0 and q4_1, which regress slightly, on GCN gpus.	2025-07-27 00:28:14 +02:00
hipudding	11dd5a44eb	CANN: Implement GLU ops (#14884 ) Implement REGLU, GEGLU, SWIGLU ops according to #14158	2025-07-26 17:56:18 +08:00
R0CKSTAR	9b8f3c6c77	musa: fix build warnings (unused variable) (#14869 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2025-07-26 10:36:02 +08:00