cuda : prevent integer truncation and overflow errors when using KQ mask strides in flash_attn_mask_to_KV_max kernel (#24945)

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
model : register t_layer_inp for qwen3next (#25141)
2026-07-05 20:15:53 +02:00 · 2026-06-30 20:47:05 +02:00 · 2026-06-30 17:57:14 +02:00 · 2026-06-30 16:16:44 +02:00 · 2026-06-30 14:16:24 +02:00 · 2026-06-30 14:06:54 +02:00
26 changed files with 324 additions and 67 deletions
@@ -11,6 +11,11 @@ struct common_http_url {
    std::string path;
 };

+// bracket an IPv6 literal host for a URL authority (RFC 3986): a host containing ':' is returned as "[host]", any other host is returned unchanged
+static std::string common_http_format_host(const std::string & host) { // host: presumably a bare host without brackets or port - confirm against callers
+    return host.find(':') != std::string::npos ? "[" + host + "]" : host; // NOTE(review): the only test is "contains ':'", so an already-bracketed or "host:port" input would be wrapped as well
+}
+
 static common_http_url common_http_parse_url(const std::string & url) {
    common_http_url parts;
    auto scheme_end = url.find("://");
@@ -49,11 +54,28 @@ static common_http_url common_http_parse_url(const std::string & url) {
        parts.path = "/";
    }

-    auto colon_pos = parts.host.find(':');
+    // split the authority into host and optional port, a bracketed IPv6 literal keeps its inner colons (RFC 3986)
+    std::string port_str;
+    if (!parts.host.empty() && parts.host.front() == '[') {
+        auto close = parts.host.find(']');
+        if (close == std::string::npos) {
+            throw std::runtime_error("invalid IPv6 URL authority: " + parts.host);
+        }
+        auto after = parts.host.substr(close + 1);
+        if (!after.empty() && after.front() == ':') {
+            port_str = after.substr(1);
+        }
+        parts.host = parts.host.substr(1, close - 1);
+    } else {
+        auto colon_pos = parts.host.find(':');
+        if (colon_pos != std::string::npos) {
+            port_str = parts.host.substr(colon_pos + 1);
+            parts.host = parts.host.substr(0, colon_pos);
+        }
+    }

-    if (colon_pos != std::string::npos) {
-        parts.port = std::stoi(parts.host.substr(colon_pos + 1));
-        parts.host = parts.host.substr(0, colon_pos);
+    if (!port_str.empty()) {
+        parts.port = std::stoi(port_str);
    } else if (parts.scheme == "http") {
        parts.port = 80;
    } else if (parts.scheme == "https") {
@@ -83,7 +105,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
    }
 #endif

-    httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
+    httplib::Client cli(parts.scheme + "://" + common_http_format_host(parts.host) + ":" + std::to_string(parts.port));

    if (!parts.user.empty()) {
        cli.set_basic_auth(parts.user, parts.password);
@@ -95,5 +117,5 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
 }

 static std::string common_http_show_masked_url(const common_http_url & parts) { // builds scheme://[****:****@]host/path; the credential mask is emitted only when parts.user is non-empty and the port is not included
-    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
+    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + common_http_format_host(parts.host) + parts.path; // host goes through common_http_format_host, so a host containing ':' is printed in brackets
 }
@@ -1551,8 +1551,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];

-        ggml_backend_synchronize(split_backend);
-
        // copy the input tensors to the split backend
        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1563,15 +1561,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                } else if (!split_backend->iface.cpy_tensor_async) {
+                } else {
                    ggml_backend_synchronize(split_backend);
                }
-                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+                ggml_backend_tensor_copy(input, input_cpy);
            } else {
                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
-                } else if (!split_backend->iface.cpy_tensor_async) {
+                } else {
                    ggml_backend_synchronize(split_backend);
                }

@@ -1676,8 +1674,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
            }
        }

-        ggml_backend_synchronize(split_backend);
-
        if (!sched->callback_eval) {
            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
@@ -664,7 +664,7 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
 template <int ncols1>
 __launch_bounds__(FATTN_KQ_STRIDE/2, 1)
 static __global__ void flash_attn_mask_to_KV_max(
-        const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) {
+        const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int64_t s31, const int64_t s33) {
    const int ne31     = gridDim.x;
    const int tid      = threadIdx.x;
    const int sequence = blockIdx.y;
@@ -1089,8 +1089,8 @@ void launch_fattn(
    // Only worth the overhead if there is at least one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
    //     multiple sequences of possibly different lengths.
    if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
-        const int s31 = mask->nb[1] / sizeof(half2);
-        const int s33 = mask->nb[3] / sizeof(half2);
+        const int64_t s31 = mask->nb[1] / sizeof(half2);
+        const int64_t s33 = mask->nb[3] / sizeof(half2);

        const dim3 blocks_num_KV_max(ntiles_x, Q->ne[3], 1);
        const dim3 block_dim_KV_max(FATTN_KQ_STRIDE/2, 1, 1);
@@ -2003,6 +2003,10 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112,  64)
 DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128,  64)
 DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256,  64)

+extern DECL_FATTN_MMA_F16_CASE(512, 512,  4,  2);
+extern DECL_FATTN_MMA_F16_CASE(512, 512,  8,  2);
+extern DECL_FATTN_MMA_F16_CASE(512, 512, 16,  2);
+extern DECL_FATTN_MMA_F16_CASE(512, 512, 32,  2);
 extern DECL_FATTN_MMA_F16_CASE(512, 512,  2,  4);
 extern DECL_FATTN_MMA_F16_CASE(512, 512,  4,  4);
 extern DECL_FATTN_MMA_F16_CASE(512, 512,  8,  4);
@@ -76,6 +76,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  64,  64)
@@ -144,6 +145,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2,  32,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  32,  64)
@@ -219,6 +221,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  64,  64)
@@ -296,6 +299,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4,  64,  64)
@@ -1308,12 +1312,12 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
            return;
        }

-        if constexpr (DV <= 256) {
-            if (use_gqa_opt && gqa_ratio % 2 == 0) {
-                launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
-                return;
-            }
+        if (use_gqa_opt && gqa_ratio % 2 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
+            return;
+        }

+        if constexpr (DV <= 256) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 1, use_logit_softcap>(ctx, dst);
            return;
        }
@@ -99,12 +99,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
        return;
    }

-    if constexpr (DKQ <= 256) {
-        if (use_gqa_opt && gqa_ratio > 1) {
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
-            return;
-        }
+    if (use_gqa_opt && gqa_ratio > 1) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
+        return;
+    }

+    if constexpr (DKQ <= 256) {
        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
    } else {
        GGML_ABORT("fatal error");
@@ -78,26 +78,29 @@ static __global__ void k_get_rows_float(

 template<typename grad_t, typename dst_t>
 static __global__ void k_get_rows_back_float( // for each destination row r and column col: dst[r*ncols + col] = sum of grad[i*ncols + col] over all i with rows[i] == r
-        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
+        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, // grad: nrows_grad x ncols values, rows: one destination row index per grad row, dst: nrows_dst x ncols output
+        const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) { // nrows_dst is passed explicitly because the row loop below no longer relies on one grid.y block per destination row
    const int col = blockIdx.x*blockDim.x + threadIdx.x; // each thread owns one column

    if (col >= ncols) {
        return;
    }

-    const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    float sum = 0.0f;
-
    ggml_cuda_pdl_sync();
-    for (int64_t i = 0; i < nrows_grad; ++i) {
-        if (rows[i] != dst_row) {
-            continue;
-        }
-        sum += grad[i*ncols + col];
-    }

-    dst[dst_row*ncols + col] = sum;
+    // grid.y is clamped to the CUDA grid limit, so stride over the destination rows (a block handles rows blockIdx.y, blockIdx.y + gridDim.y, ... while below nrows_dst)
+    for (int64_t dst_row = blockIdx.y; dst_row < nrows_dst; dst_row += gridDim.y) {
+        float sum = 0.0f;
+
+        for (int64_t i = 0; i < nrows_grad; ++i) { // scan every grad row and keep only those whose index selects dst_row
+            if (rows[i] != dst_row) {
+                continue;
+            }
+            sum += grad[i*ncols + col];
+        }
+
+        dst[dst_row*ncols + col] = sum; // stored unconditionally, so a destination row that no grad row maps to is written as 0
+    }
 }

 template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
@@ -302,7 +305,7 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d

    const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne1, 1);
+    const dim3 block_nums(block_num_x, MIN(ne1, (int64_t)UINT16_MAX), 1);

-    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
+    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10, ne1);
 }
@@ -3192,24 +3192,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

-    // Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
-    // Excluding this path for HIP and MUSA as a precaution.
-    // According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
-    // Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
-    // e.g. https://github.com/ROCm/rocm-systems/issues/5109
-    // It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
-    // This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
-#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
-    const bool copy_from_host = false;
-#else
-    const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
-#endif
-
-    if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
        return false;
    }

-    if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
+    if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
        return false;
    }

@@ -3220,17 +3207,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;

-    if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
-        !copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif // NDEBUG
        return false;
    }

-    if (copy_from_host) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
-    } else if (backend_src != backend_dst) {
+    if (backend_src != backend_dst) {
        // copy on src stream
        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -368,5 +368,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
        return true;
    }

+    // gfx900 (Vega 10) lacks native dp4a, loses to dequant + hipBLAS
+    // for dense matrices; keep MMQ only for MoE, where the
+    // hipBLAS path is much slower.
+    if (cc == GGML_CUDA_CC_VEGA) {
+        return n_experts > 0;
+    }
+
    return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 16, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 32, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 4, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 8, 2);
@@ -92,7 +92,7 @@ for ncols in [8, 16, 32, 64]:
                    continue
                if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
                    continue
-                if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
+                if head_size_kq == 512 and ncols2 not in (2, 4, 8): # Gemma 4 (+ MTP)
                    continue
                if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
                    continue
@@ -2475,6 +2475,85 @@ static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count
    return true;
 }

+// Remove the loop unrolling hint of the matmul shader's BK loop
+// and replace it with the dont_unroll hint for better performance on
+// hardware like Apple M1/M2.
+// Assumes 1. code comes from mul_mm.comp 2. the K-tile loop has no loop
+// control hint and 3. the BK loop is the last loop nested directly inside
+// the K-tile loop.
+// Returns true when the input was modified; returns false otherwise
+// without touching `out`. (code/word_count: SPIR-V module as 32-bit words; on success `out` holds a full copy with a single loop-control word changed.)
+static bool ggml_vk_roll_bk_loop(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
+    if (word_count < 5) { // the instruction scan below starts at word 5 (presumably right after the SPIR-V module header)
+        return false;
+    }
+
+    struct vk_spv_loop {
+        size_t   header; // word index of the OpLoopMerge instruction
+        size_t   end; // word index of the OpLabel whose id equals word 1 of the OpLoopMerge (merge block per the SPIR-V spec); 0 while not found
+        uint32_t control; // word 3 of the OpLoopMerge, i.e. its loop control mask
+    };
+
+    std::vector<vk_spv_loop> loops;
+
+    // Collect a list of all loops in the module. Instructions are walked using the word count stored in the upper bits of each first word.
+    for (size_t pos = 5; pos < word_count; ) {
+        const uint32_t wc = code[pos] >> spv::WordCountShift;
+        const uint32_t op = code[pos] & spv::OpCodeMask;
+        if (wc == 0 || pos + wc > word_count) { // zero-length or truncated instruction: give up without modifying anything
+            return false;
+        }
+
+        if (op == spv::OpLoopMerge && wc >= 4) { loops.push_back({ pos, 0, code[pos + 3] }); }
+
+        if (op == spv::OpLabel && wc >= 2) { // a label closes every still-open loop whose recorded merge id matches this label id
+            for (auto & l : loops) {
+                if (l.end == 0 && code[l.header + 1] == code[pos + 1]) { l.end = pos; }
+            }
+        }
+
+        pos += wc;
+    }
+
+    auto encloses = [](const vk_spv_loop & a, const vk_spv_loop & b) { // true when b's OpLoopMerge lies strictly between a's OpLoopMerge and a's end label; never true for a loop whose end is still 0
+        return a.header < b.header && b.header < a.end;
+    };
+
+    // Find the BK loop. Only loops whose control word is exactly the Unroll mask are candidates.
+    const vk_spv_loop * bk = nullptr;
+    for (const auto & h : loops) {
+        if (h.control != spv::LoopControlUnrollMask) {
+            continue;
+        }
+        const vk_spv_loop * parent = nullptr; // innermost enclosing loop = enclosing loop with the largest header index
+        bool has_child = false;
+        for (const auto & g : loops) {
+            if (encloses(g, h) && (!parent || g.header > parent->header)) {
+                parent = &g;
+            }
+            if (encloses(h, g)) {
+                has_child = true;
+            }
+        }
+        // BK loop should be the last loop nested inside the loop with no hint
+        // and have at least one child loop. ("last" = largest header index among the matching candidates)
+        if (parent &&
+            parent->control == spv::LoopControlMaskNone &&
+            has_child &&
+            (!bk || h.header > bk->header)) {
+            bk = &h;
+        }
+    }
+    if (!bk) {
+        return false;
+    }
+
+    // set DontUnroll instead of Unroll (the whole module is copied into `out`, then only the selected loop's control word is overwritten)
+    out.assign(code, code + word_count);
+    out[bk->header + 3] = spv::LoopControlDontUnrollMask;
+    return true;
+}
+
 static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
                                         uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
                                         bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
@@ -2558,6 +2637,22 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
    }
 #endif

+#if VK_HEADER_VERSION >= 287
+    // Roll the mul_mm BK loop on Asahi Linux. Skip bf16 and the mul_mmq pipelines.
+    if (device->driver_id == vk::DriverId::eMesaHoneykrisp &&
+        pipeline->name.rfind("matmul", 0) == 0 &&
+        pipeline->name.find("bf16") == std::string::npos &&
+        pipeline->name.find("q8_1") == std::string::npos) {
+        const uint32_t * src   = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
+        size_t           src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
+        std::vector<uint32_t> rolled;
+        if (ggml_vk_roll_bk_loop(src, src_n, rolled)) {
+            spirv = std::move(rolled);
+            shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
+        }
+    }
+#endif
+
    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);

    vk::PushConstantRange pcr(
@@ -1563,6 +1563,7 @@ class ggml_webgpu_shader_lib {
                        case GGML_TYPE_IQ1_S:
                        case GGML_TYPE_IQ4_NL:
                        case GGML_TYPE_MXFP4:
+                        case GGML_TYPE_NVFP4:
                            {
                                // Quantized types using u32 buffers for portability.
                                defines.push_back("SRC_TYPE=u32");
@@ -1593,6 +1594,8 @@ class ggml_webgpu_shader_lib {
                    } else if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
                               key.src_type == GGML_TYPE_IQ4_NL || key.src_type == GGML_TYPE_MXFP4) {
                        defines.push_back("BLOCK_SIZE=32u");
+                    } else if (key.src_type == GGML_TYPE_NVFP4) {
+                        defines.push_back("BLOCK_SIZE=64u");
                    } else if (key.src_type >= GGML_TYPE_Q2_K) {
                        defines.push_back("BLOCK_SIZE=256u");
                    } else {
@@ -1960,6 +1963,7 @@ class ggml_webgpu_shader_lib {
                            defines.push_back(type_upper + "_TABLES");
                            break;
                        case GGML_TYPE_MXFP4:
+                        case GGML_TYPE_NVFP4:
                            defines.push_back(type_upper + "_LUT");
                            break;
                        default:
@@ -2103,6 +2107,7 @@ class ggml_webgpu_shader_lib {
                            defines.push_back(type_upper + "_TABLES");
                            break;
                        case GGML_TYPE_MXFP4:
+                        case GGML_TYPE_NVFP4:
                            defines.push_back(type_upper + "_LUT");
                            break;
                        default:
@@ -2274,6 +2279,7 @@ class ggml_webgpu_shader_lib {
                            defines.push_back(type_upper + "_TABLES");
                            break;
                        case GGML_TYPE_MXFP4:
+                        case GGML_TYPE_NVFP4:
                            defines.push_back(type_upper + "_LUT");
                            break;
                        default:
@@ -2394,6 +2400,7 @@ class ggml_webgpu_shader_lib {
                            defines.push_back(type_upper + "_TABLES");
                            break;
                        case GGML_TYPE_MXFP4:
+                        case GGML_TYPE_NVFP4:
                            defines.push_back(type_upper + "_LUT");
                            break;
                        default:
@@ -4056,6 +4056,7 @@ static bool ggml_webgpu_supported_qtype(ggml_type type) {
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_MXFP4:
+        case GGML_TYPE_NVFP4:
            return true;
        default:
            return false;
@@ -4156,6 +4157,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                            case GGML_TYPE_IQ4_NL:
                            case GGML_TYPE_IQ4_XS:
                            case GGML_TYPE_MXFP4:
+                            case GGML_TYPE_NVFP4:
                                supports_op = true;
                                break;
                            default:
@@ -4196,6 +4198,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                        case GGML_TYPE_IQ4_NL:
                        case GGML_TYPE_IQ4_XS:
                        case GGML_TYPE_MXFP4:
+                        case GGML_TYPE_NVFP4:
                            supports_op = true;
                            break;
                        default:
@@ -896,9 +896,23 @@ const kvalues_iq4nl = array<i32, 16>(

 #endif

-#ifdef MXFP4_LUT
+#if defined(MXFP4_LUT) || defined(NVFP4_LUT)
 const kvalues_mxfp4 = array<i32, 16>(
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12
 );
-#endif
+#endif // MXFP4_LUT || NVFP4_LUT

+#ifdef NVFP4_LUT
+fn ue4m3_to_fp32(u: u32) -> f32 { // decodes one scale byte (4 exponent bits above 3 mantissa bits, no sign bit used) into an f32
+    if (u == 0u || u == 127u) { // the byte values 0 and 127 (0x7F) both decode to 0.0
+        return 0.0;
+    }
+    let exp = (u >> 3u) & 15u; // bits 3..6
+    let man = u & 7u; // bits 0..2
+    if (exp == 0u) { // zero exponent field: value is man / 512
+        return f32(man) * (1.0 / 512.0);
+    }
+    let bits = ((exp + 120u) << 23u) | (man << 20u); // f32 bit pattern with exponent field exp + 120 and man in the top 3 mantissa bits, i.e. 2^(exp - 7) * (1 + man / 8)
+    return bitcast<f32>(bits);
+}
+#endif // NVFP4_LUT
@@ -672,6 +672,27 @@ fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
 }
 #endif

+#ifdef NVFP4
+fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { // expands the 36-byte block number (src_base + offset) into 64 f32 values starting at dst[dst_base + offset * 64]
+    let block_byte_base = (src_base + offset) * 36; // byte address of the block; 36 = 4 scale bytes + 32 bytes of packed 4-bit values as read below
+    let d_word = load_u32_at_src(block_byte_base); // first word of the block: four scale bytes, one per group of 16 values
+    for (var sub: u32 = 0u; sub < 4; sub++) {
+        let d = ue4m3_to_fp32(get_byte(d_word, sub)) * 0.5; // NOTE(review): the extra 0.5 presumably compensates for kvalues_mxfp4 holding doubled values - confirm
+        for (var j: u32 = 0u; j < 2; j++) { // each group owns 8 data bytes, read as two u32 words
+            let q_packed = load_u32_at_src(block_byte_base + 4 + sub * 8 + j * 4);
+            for (var k: u32 = 0; k < 4; k++) {
+                let q_byte = get_byte(q_packed, k);
+                let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
+                let q_hi = f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d;
+                let dst_offset = dst_base + offset * 64 + sub * 16 + j * 4 + k;
+                dst[dst_offset] = q_lo; // low nibbles fill positions 0..7 of the 16-value group
+                dst[dst_offset + 8u] = q_hi; // high nibbles fill positions 8..15 of the same group
+            }
+        }
+    }
+}
+#endif
+

@group(0) @binding(0)
 var<storage, read_write> src: array<SRC_TYPE>;
@@ -241,7 +241,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
 #endif // INIT_SRC0_SHMEM_Q8_1

 #if defined(INIT_SRC0_SHMEM_MXFP4)
-            let block_byte_base = src0_idx * 17u;
+            let block_byte_base = src0_idx * 17u; // BLOCK_SIZE_BYTES = 17u;
            let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
            let e = ldexp(1.0, i32(eu8) - 128);

@@ -263,6 +263,47 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
 }
 #endif // legacy-quants

+#if defined(INIT_SRC0_SHMEM_NVFP4)
+const BLOCK_SIZE = 64u; // values per quantized block
+const BLOCK_SIZE_BYTES = 36u; // bytes per block: scale bytes at offset 0, packed 4-bit data from offset 4 (see the byte bases below)
+const SUB_BLOCK_SIZE = 16u; // elements sharing one UE4M3 scale
+const NQ = 16u; // shmem values produced per loop iteration of one thread
+const BYTES_PER_THREAD = 8u; // packed data bytes consumed per iteration (8 bytes * 2 nibbles = 16 values)
+const BYTES_PER_INNER_LOOP = 4u; // bytes unpacked per loaded u32 word
+
+fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { // dequantizes the src0 tile at rows offset_m.., columns k_outer.. into shmem as f16
+    for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) { // i is the shmem index of the first of 16 consecutive values handled in this iteration
+        let tile_m = i / TILE_K;
+        let tile_k_start = i % TILE_K;
+        let global_m = offset_m + tile_m;
+        let global_k_start = k_outer + tile_k_start;
+
+        if (global_m >= params.m) { // row lies outside the matrix: stop; later iterations of this thread only reach larger i, hence rows that are not smaller
+            break;
+        }
+
+        let block_k  = global_k_start / BLOCK_SIZE;
+        let sub_block      = (global_k_start % BLOCK_SIZE) / SUB_BLOCK_SIZE; // which of the four 16-value groups inside the block
+        let src0_idx = batch_offset + global_m * params.stride_01 + block_k; // block index; NOTE(review): assumes batch_offset and stride_01 are expressed in blocks - confirm
+
+        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
+        let d_byte_base     = block_byte_base;
+        let qs_byte_base    = block_byte_base + 4u;
+
+        let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(d_byte_base), sub_block)) * 0.5; // scale byte number sub_block of the block's first word, decoded and halved
+
+        for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j++) { // two u32 words = the 8 data bytes of this group
+            let q_packed = load_u32_at_src0_aligned(qs_byte_base + sub_block * 8u + j * 4u);
+            for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
+                let q_byte = get_byte(q_packed, k);
+                shmem[i + j * BYTES_PER_INNER_LOOP + k]      = f16(f32(kvalues_mxfp4[q_byte & 0xF]) * d); // low nibble -> positions 0..7 of the group
+                shmem[i + j * BYTES_PER_INNER_LOOP + k + 8u] = f16(f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d); // high nibble -> positions 8..15 of the group
+            }
+        }
+    }
+}
+#endif // INIT_SRC0_SHMEM_NVFP4
+
 // k-quants
 #if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
 const BLOCK_SIZE = 256u;
@@ -1505,3 +1505,49 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src
    return acc;
 }
 #endif
+
+#ifdef MUL_ACC_NVFP4
+#define BLOCK_SIZE 64
+#define BLOCK_SIZE_BYTES 36
+#define THREADS_PER_BLOCK 4
+#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
+fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array<array<f32, OUTPUTS_PER_WG>, NUM_COLS> { // returns this thread's partial dot products, indexed [col][row], for rows row_base .. row_base + OUTPUTS_PER_WG - 1
+    var acc: array<array<f32, OUTPUTS_PER_WG>, NUM_COLS>;
+
+    let num_blocks = params.k / BLOCK_SIZE;
+    let sub = thread_id % THREADS_PER_BLOCK; // which 16-value group of a block this thread handles
+    for (var block = thread_id/THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE/THREADS_PER_BLOCK) { // THREADS_PER_BLOCK threads share one block, each taking one group
+        let x_base = src1_idx_base + block * BLOCK_SIZE + sub * ELEMS_PER_THREAD;
+        var x_block: array<array<f32, ELEMS_PER_THREAD>, NUM_COLS>;
+        for (var col = 0u; col < NUM_COLS;col += 1) { // load the ELEMS_PER_THREAD src1 values of this group once per column, reused for every output row below
+            for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) {
+                x_block[col][i]     = f32(src1[x_base + col * params.stride_11 + i]);
+                x_block[col][i + 8] = f32(src1[x_base + col * params.stride_11 + i + 8]);
+            }
+        }
+        for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
+            let output_row = row_base + row;
+            if (output_row < params.m) { // rows past the end of the matrix are skipped, their acc entries are not modified
+                let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
+                let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(block_byte_base), sub)) * 0.5; // scale byte number sub of the block's first word, decoded and halved
+                let q_w0 = load_u32_at_src0_aligned(block_byte_base + 4u + 8u * sub); // first 4 data bytes of the group
+                let q_w1 = load_u32_at_src0_aligned(block_byte_base + 8u + 8u * sub); // next 4 data bytes of the group
+                for (var col = 0u;col < NUM_COLS;col += 1) {
+                    var row_sum = 0.0;
+                    for (var l = 0u; l < 8u; l++) {
+                        let q_word = select(q_w0, q_w1, l >= 4u); // bytes 0..3 come from q_w0, bytes 4..7 from q_w1
+                        let q_byte = get_byte(q_word, l % 4u);
+                        let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
+                        let q_hi = f32(kvalues_mxfp4[(q_byte >> 4u) & 0xFu]) * d;
+                        row_sum += q_lo * x_block[col][l]; // low nibble pairs with group position l
+                        row_sum += q_hi * x_block[col][l + 8u]; // high nibble pairs with group position l + 8
+                    }
+                    acc[col][row] += row_sum;
+                }
+            }
+        }
+    }
+
+    return acc;
+}
+#endif
@@ -121,6 +121,8 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -7759,6 +7759,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    }

    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
+    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 70000, 4, 1, false)); // row count > CUDA grid-y limit (65535)
    for (ggml_type type : all_types) {
        for (bool v : {false, true}) {
            test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
@@ -39,7 +39,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
        throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
    }

-    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
+    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), common_http_format_host(parsed_url.host).c_str(), parsed_url.port, parsed_url.path.c_str());

    std::map<std::string, std::string> headers;
    const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "http.h"
 #include "server-http.h"
 #include "server-stream.h"
 #include "server-common.h"
@@ -441,7 +442,7 @@ bool server_http_context::start() {
    srv->wait_until_ready();

    listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
-                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
+                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", common_http_format_host(hostname).c_str(), port);
    return true;
 }

@@ -1,4 +1,5 @@
 #include "server-common.h"
+#include "http.h"
 #include "server-models.h"
 #include "server-context.h"
 #include "server-stream.h"
@@ -2263,7 +2264,8 @@ server_http_proxy::server_http_proxy(
            }
            if (lowered == "host") {
                bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
-                req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
+                const std::string url_host = common_http_format_host(host);
+                req.set_header(key, is_default_port ? url_host : url_host + ":" + std::to_string(port));
            } else {
                req.set_header(key, value);
            }
Author	SHA1	Message	Date
fairydreaming	0eca4d490e	cuda : prevent integer truncation and overflow errors when using KQ mask strides in flash_attn_mask_to_KV_max kernel (#24945 ) Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>	2026-06-30 20:47:05 +02:00
Jürgen Schmied	4f31eedb0c	model : register t_layer_inp for qwen3next (#25141 ) * Fix input assignment in layer processing loop Fix DFLASH for qwen-coder-next * add line break Added tensor for attention normalization in Qwen3 model.	2026-06-30 17:57:14 +02:00
Pascal	799fcc04a5	common,server: handle bracketed IPv6 literals in URL authority (#25140 ) * common,server: handle bracketed IPv6 literals in URL authority Parse the [host]:port form (RFC 3986) and bracket IPv6 hosts when formatting a URL authority: listening log, proxy Host header, proxy log, client rebuild. The per-request remote_addr stays bare. * common: restore unsupported scheme throw in url parser Address @ngxson review: keep the explicit reject in port resolution so the block stays self-contained. Non-http(s) schemes still throw (also gated at the top of common_http_parse_url).	2026-06-30 16:16:44 +02:00
Matt Jallo	931eb37f8c	CUDA: fix get_rows_back for tables with more than 65535 rows (grid-y clamp + stride) (#25103 )	2026-06-30 14:16:24 +02:00
Johannes Gäßler	e495d1e748	CUDA: fix Gemma E4B MTP FlashAttention (#25148 ) * CUDA: fix Gemma E4B MTP FlashAttention * remove unused template declaration	2026-06-30 14:06:54 +02:00
Kevin Liu	f708a5b2ca	vulkan: roll bk loop in matmul for asahi linux (#24663 ) * vulkan: roll bk loop in matmul for asahi linux * vulkan: fix inline comment * vulkan: revert BK-loop unroll change * vulkan: edit spirv directly for asahi roll bk loop * vulkan: remove trailing whitespace at the end of comments	2026-06-30 12:27:38 +02:00
zduford	d9df11006f	HIP: use hipBLAS for dense prefill on gfx900, keep MMQ for MoE (#24588 ) * HIP: keep MMQ for gfx900 MoE and Q8_0, use hipBLAS for dense K-quants Assisted-by: GitHub Copilot CLI * HIP: tighten conditional block to be explicitly for gfx900 * HIP: Further simplified gfx900 conditional block * removed unnecessary comment	2026-06-30 11:51:38 +02:00
Masashi Yoshimura	6c5de1cc83	ggml-webgpu: add support for NVFP4 (#25143 )	2026-06-30 17:20:04 +09:00
Oliver Simons	86b94708f2	Revert "sched : reintroduce less synchronizations during split compute (#20793 )" (#25138 )	2026-06-30 08:41:45 +08:00