Compare commits

...

9 Commits

Author SHA1 Message Date
fairydreaming 0eca4d490e cuda : prevent integer truncation and overflow errors when using KQ mask strides in flash_attn_mask_to_KV_max kernel (#24945)
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2026-06-30 20:47:05 +02:00
Jürgen Schmied 4f31eedb0c model : register t_layer_inp for qwen3next (#25141)
* Fix input assignment in layer processing loop

Fix DFLASH for qwen-coder-next

* add line break

Added tensor for attention normalization in Qwen3 model.
2026-06-30 17:57:14 +02:00
Pascal 799fcc04a5 common,server: handle bracketed IPv6 literals in URL authority (#25140)
* common,server: handle bracketed IPv6 literals in URL authority

Parse the [host]:port form (RFC 3986) and bracket IPv6 hosts when
formatting a URL authority: listening log, proxy Host header, proxy
log, client rebuild. The per-request remote_addr stays bare.

* common: restore unsupported scheme throw in url parser

Address @ngxson review: keep the explicit reject in port resolution so
the block stays self-contained. Non-http(s) schemes still throw (also
gated at the top of common_http_parse_url).
2026-06-30 16:16:44 +02:00
Matt Jallo 931eb37f8c CUDA: fix get_rows_back for tables with more than 65535 rows (grid-y clamp + stride) (#25103) 2026-06-30 14:16:24 +02:00
Johannes Gäßler e495d1e748 CUDA: fix Gemma E4B MTP FlashAttention (#25148)
* CUDA: fix Gemma E4B MTP FlashAttention

* remove unused template declaration
2026-06-30 14:06:54 +02:00
Kevin Liu f708a5b2ca vulkan: roll bk loop in matmul for asahi linux (#24663)
* vulkan: roll bk loop in matmul for asahi linux

* vulkan: fix inline comment

* vulkan: revert BK-loop unroll change

* vulkan: edit spirv directly for asahi roll bk loop

* vulkan: remove trailing whitespace at the end of comments
2026-06-30 12:27:38 +02:00
zduford d9df11006f HIP: use hipBLAS for dense prefill on gfx900, keep MMQ for MoE (#24588)
* HIP: keep MMQ for gfx900 MoE and Q8_0, use hipBLAS for dense K-quants

Assisted-by: GitHub Copilot CLI

* HIP: tighten conditional block to be explicitly for gfx900

* HIP: Further simplified gfx900 conditional block

* removed unnecessary comment
2026-06-30 11:51:38 +02:00
Masashi Yoshimura 6c5de1cc83 ggml-webgpu: add support for NVFP4 (#25143) 2026-06-30 17:20:04 +09:00
Oliver Simons 86b94708f2 Revert "sched : reintroduce less synchronizations during split compute (#20793)" (#25138) 2026-06-30 08:41:45 +08:00
26 changed files with 324 additions and 67 deletions
+28 -6
View File
@@ -11,6 +11,11 @@ struct common_http_url {
std::string path;
};
// bracket an IPv6 literal host for a URL authority (RFC 3986)
static std::string common_http_format_host(const std::string & host) {
return host.find(':') != std::string::npos ? "[" + host + "]" : host;
}
static common_http_url common_http_parse_url(const std::string & url) {
common_http_url parts;
auto scheme_end = url.find("://");
@@ -49,11 +54,28 @@ static common_http_url common_http_parse_url(const std::string & url) {
parts.path = "/";
}
auto colon_pos = parts.host.find(':');
// split the authority into host and optional port, a bracketed IPv6 literal keeps its inner colons (RFC 3986)
std::string port_str;
if (!parts.host.empty() && parts.host.front() == '[') {
auto close = parts.host.find(']');
if (close == std::string::npos) {
throw std::runtime_error("invalid IPv6 URL authority: " + parts.host);
}
auto after = parts.host.substr(close + 1);
if (!after.empty() && after.front() == ':') {
port_str = after.substr(1);
}
parts.host = parts.host.substr(1, close - 1);
} else {
auto colon_pos = parts.host.find(':');
if (colon_pos != std::string::npos) {
port_str = parts.host.substr(colon_pos + 1);
parts.host = parts.host.substr(0, colon_pos);
}
}
if (colon_pos != std::string::npos) {
parts.port = std::stoi(parts.host.substr(colon_pos + 1));
parts.host = parts.host.substr(0, colon_pos);
if (!port_str.empty()) {
parts.port = std::stoi(port_str);
} else if (parts.scheme == "http") {
parts.port = 80;
} else if (parts.scheme == "https") {
@@ -83,7 +105,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
}
#endif
httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
httplib::Client cli(parts.scheme + "://" + common_http_format_host(parts.host) + ":" + std::to_string(parts.port));
if (!parts.user.empty()) {
cli.set_basic_auth(parts.user, parts.password);
@@ -95,5 +117,5 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
}
static std::string common_http_show_masked_url(const common_http_url & parts) {
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + common_http_format_host(parts.host) + parts.path;
}
+3 -7
View File
@@ -1551,8 +1551,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
ggml_backend_synchronize(split_backend);
// copy the input tensors to the split backend
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1563,15 +1561,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else if (!split_backend->iface.cpy_tensor_async) {
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else if (!split_backend->iface.cpy_tensor_async) {
} else {
ggml_backend_synchronize(split_backend);
}
@@ -1676,8 +1674,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
ggml_backend_synchronize(split_backend);
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
+3 -3
View File
@@ -664,7 +664,7 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
template <int ncols1>
__launch_bounds__(FATTN_KQ_STRIDE/2, 1)
static __global__ void flash_attn_mask_to_KV_max(
const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) {
const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int64_t s31, const int64_t s33) {
const int ne31 = gridDim.x;
const int tid = threadIdx.x;
const int sequence = blockIdx.y;
@@ -1089,8 +1089,8 @@ void launch_fattn(
// Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
// multiple sequences of possibly different lengths.
if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
const int s31 = mask->nb[1] / sizeof(half2);
const int s33 = mask->nb[3] / sizeof(half2);
const int64_t s31 = mask->nb[1] / sizeof(half2);
const int64_t s33 = mask->nb[3] / sizeof(half2);
const dim3 blocks_num_KV_max(ntiles_x, Q->ne[3], 1);
const dim3 block_dim_KV_max(FATTN_KQ_STRIDE/2, 1, 1);
+4
View File
@@ -2003,6 +2003,10 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64)
extern DECL_FATTN_MMA_F16_CASE(512, 512, 4, 2);
extern DECL_FATTN_MMA_F16_CASE(512, 512, 8, 2);
extern DECL_FATTN_MMA_F16_CASE(512, 512, 16, 2);
extern DECL_FATTN_MMA_F16_CASE(512, 512, 32, 2);
extern DECL_FATTN_MMA_F16_CASE(512, 512, 2, 4);
extern DECL_FATTN_MMA_F16_CASE(512, 512, 4, 4);
extern DECL_FATTN_MMA_F16_CASE(512, 512, 8, 4);
+9 -5
View File
@@ -76,6 +76,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
@@ -144,6 +145,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 32, 64)
@@ -219,6 +221,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
@@ -296,6 +299,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4, 64, 64)
@@ -1308,12 +1312,12 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
return;
}
if constexpr (DV <= 256) {
if (use_gqa_opt && gqa_ratio % 2 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
return;
}
if (use_gqa_opt && gqa_ratio % 2 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
return;
}
if constexpr (DV <= 256) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 1, use_logit_softcap>(ctx, dst);
return;
}
+5 -5
View File
@@ -99,12 +99,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
return;
}
if constexpr (DKQ <= 256) {
if (use_gqa_opt && gqa_ratio > 1) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
if (use_gqa_opt && gqa_ratio > 1) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
if constexpr (DKQ <= 256) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
} else {
GGML_ABORT("fatal error");
+17 -14
View File
@@ -78,26 +78,29 @@ static __global__ void k_get_rows_float(
template<typename grad_t, typename dst_t>
static __global__ void k_get_rows_back_float(
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst,
const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) {
const int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
float sum = 0.0f;
ggml_cuda_pdl_sync();
for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}
dst[dst_row*ncols + col] = sum;
// grid.y is clamped to the CUDA grid limit, so stride over the destination rows
for (int64_t dst_row = blockIdx.y; dst_row < nrows_dst; dst_row += gridDim.y) {
float sum = 0.0f;
for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}
dst[dst_row*ncols + col] = sum;
}
}
template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
@@ -302,7 +305,7 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d
const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne1, 1);
const dim3 block_nums(block_num_x, MIN(ne1, (int64_t)UINT16_MAX), 1);
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10, ne1);
}
+4 -20
View File
@@ -3192,24 +3192,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
// Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
// Excluding this path for HIP and MUSA as a precaution.
// According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
// Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
// e.g. https://github.com/ROCm/rocm-systems/issues/5109
// It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
// This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
const bool copy_from_host = false;
#else
const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
#endif
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
return false;
}
@@ -3220,17 +3207,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif // NDEBUG
return false;
}
if (copy_from_host) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
} else if (backend_src != backend_dst) {
if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
+7
View File
@@ -368,5 +368,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
return true;
}
// gfx900 (Vega 10) lacks native dp4a, loses to dequant + hipBLAS
// for dense matrices; keep MMQ only for MoE, where the
// hipBLAS path is much slower.
if (cc == GGML_CUDA_CC_VEGA) {
return n_experts > 0;
}
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
DECL_FATTN_MMA_F16_CASE(512, 512, 16, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
DECL_FATTN_MMA_F16_CASE(512, 512, 32, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
DECL_FATTN_MMA_F16_CASE(512, 512, 4, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
DECL_FATTN_MMA_F16_CASE(512, 512, 8, 2);
@@ -92,7 +92,7 @@ for ncols in [8, 16, 32, 64]:
continue
if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
continue
if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
if head_size_kq == 512 and ncols2 not in (2, 4, 8): # Gemma 4 (+ MTP)
continue
if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
continue
+95
View File
@@ -2475,6 +2475,85 @@ static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count
return true;
}
// Remove the loop unrolling hint of the matmul shader's BK loop
// and replace it with the dont_unroll hint for better performance on
// hardware like Apple M1/M2.
// Assumes 1. code comes from mul_mm.comp 2. the K-tile loop has no loop
// control hint and 3. the BK loop is the last loop nested directly inside
// the K-tile loop.
// Returns true when the input was modified; returns false otherwise
// without touching `out`.
static bool ggml_vk_roll_bk_loop(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
if (word_count < 5) {
return false;
}
struct vk_spv_loop {
size_t header;
size_t end;
uint32_t control;
};
std::vector<vk_spv_loop> loops;
// Collect a list of all loops in the module.
for (size_t pos = 5; pos < word_count; ) {
const uint32_t wc = code[pos] >> spv::WordCountShift;
const uint32_t op = code[pos] & spv::OpCodeMask;
if (wc == 0 || pos + wc > word_count) {
return false;
}
if (op == spv::OpLoopMerge && wc >= 4) { loops.push_back({ pos, 0, code[pos + 3] }); }
if (op == spv::OpLabel && wc >= 2) {
for (auto & l : loops) {
if (l.end == 0 && code[l.header + 1] == code[pos + 1]) { l.end = pos; }
}
}
pos += wc;
}
auto encloses = [](const vk_spv_loop & a, const vk_spv_loop & b) {
return a.header < b.header && b.header < a.end;
};
// Find the BK loop.
const vk_spv_loop * bk = nullptr;
for (const auto & h : loops) {
if (h.control != spv::LoopControlUnrollMask) {
continue;
}
const vk_spv_loop * parent = nullptr;
bool has_child = false;
for (const auto & g : loops) {
if (encloses(g, h) && (!parent || g.header > parent->header)) {
parent = &g;
}
if (encloses(h, g)) {
has_child = true;
}
}
// BK loop should be the last loop nested inside the loop with no hint
// and have at least one child loop.
if (parent &&
parent->control == spv::LoopControlMaskNone &&
has_child &&
(!bk || h.header > bk->header)) {
bk = &h;
}
}
if (!bk) {
return false;
}
// set DontUnroll instead of Unroll
out.assign(code, code + word_count);
out[bk->header + 3] = spv::LoopControlDontUnrollMask;
return true;
}
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
@@ -2558,6 +2637,22 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
}
#endif
#if VK_HEADER_VERSION >= 287
// Roll the mul_mm BK loop on Asahi Linux. Skip bf16 and the mul_mmq pipelines.
if (device->driver_id == vk::DriverId::eMesaHoneykrisp &&
pipeline->name.rfind("matmul", 0) == 0 &&
pipeline->name.find("bf16") == std::string::npos &&
pipeline->name.find("q8_1") == std::string::npos) {
const uint32_t * src = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
size_t src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
std::vector<uint32_t> rolled;
if (ggml_vk_roll_bk_loop(src, src_n, rolled)) {
spirv = std::move(rolled);
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
}
}
#endif
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
vk::PushConstantRange pcr(
@@ -1563,6 +1563,7 @@ class ggml_webgpu_shader_lib {
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
{
// Quantized types using u32 buffers for portability.
defines.push_back("SRC_TYPE=u32");
@@ -1593,6 +1594,8 @@ class ggml_webgpu_shader_lib {
} else if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
key.src_type == GGML_TYPE_IQ4_NL || key.src_type == GGML_TYPE_MXFP4) {
defines.push_back("BLOCK_SIZE=32u");
} else if (key.src_type == GGML_TYPE_NVFP4) {
defines.push_back("BLOCK_SIZE=64u");
} else if (key.src_type >= GGML_TYPE_Q2_K) {
defines.push_back("BLOCK_SIZE=256u");
} else {
@@ -1960,6 +1963,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
@@ -2103,6 +2107,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
@@ -2274,6 +2279,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
@@ -2394,6 +2400,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
+3
View File
@@ -4056,6 +4056,7 @@ static bool ggml_webgpu_supported_qtype(ggml_type type) {
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
return true;
default:
return false;
@@ -4156,6 +4157,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
supports_op = true;
break;
default:
@@ -4196,6 +4198,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
supports_op = true;
break;
default:
@@ -896,9 +896,23 @@ const kvalues_iq4nl = array<i32, 16>(
#endif
#ifdef MXFP4_LUT
#if defined(MXFP4_LUT) || defined(NVFP4_LUT)
const kvalues_mxfp4 = array<i32, 16>(
0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12
);
#endif
#endif // MXFP4_LUT || NVFP4_LUT
#ifdef NVFP4_LUT
fn ue4m3_to_fp32(u: u32) -> f32 {
if (u == 0u || u == 127u) {
return 0.0;
}
let exp = (u >> 3u) & 15u;
let man = u & 7u;
if (exp == 0u) {
return f32(man) * (1.0 / 512.0);
}
let bits = ((exp + 120u) << 23u) | (man << 20u);
return bitcast<f32>(bits);
}
#endif // NVFP4_LUT
@@ -672,6 +672,27 @@ fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
}
#endif
#ifdef NVFP4
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
let block_byte_base = (src_base + offset) * 36;
let d_word = load_u32_at_src(block_byte_base);
for (var sub: u32 = 0u; sub < 4; sub++) {
let d = ue4m3_to_fp32(get_byte(d_word, sub)) * 0.5;
for (var j: u32 = 0u; j < 2; j++) {
let q_packed = load_u32_at_src(block_byte_base + 4 + sub * 8 + j * 4);
for (var k: u32 = 0; k < 4; k++) {
let q_byte = get_byte(q_packed, k);
let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d;
let dst_offset = dst_base + offset * 64 + sub * 16 + j * 4 + k;
dst[dst_offset] = q_lo;
dst[dst_offset + 8u] = q_hi;
}
}
}
}
#endif
@group(0) @binding(0)
var<storage, read_write> src: array<SRC_TYPE>;
@@ -241,7 +241,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
#endif // INIT_SRC0_SHMEM_Q8_1
#if defined(INIT_SRC0_SHMEM_MXFP4)
let block_byte_base = src0_idx * 17u;
let block_byte_base = src0_idx * 17u; // BLOCK_SIZE_BYTES = 17u;
let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
let e = ldexp(1.0, i32(eu8) - 128);
@@ -263,6 +263,47 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
}
#endif // legacy-quants
#if defined(INIT_SRC0_SHMEM_NVFP4)
const BLOCK_SIZE = 64u;
const BLOCK_SIZE_BYTES = 36u;
const SUB_BLOCK_SIZE = 16u; // elements sharing one UE4M3 scale
const NQ = 16u;
const BYTES_PER_THREAD = 8u;
const BYTES_PER_INNER_LOOP = 4u;
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
let tile_m = i / TILE_K;
let tile_k_start = i % TILE_K;
let global_m = offset_m + tile_m;
let global_k_start = k_outer + tile_k_start;
if (global_m >= params.m) {
break;
}
let block_k = global_k_start / BLOCK_SIZE;
let sub_block = (global_k_start % BLOCK_SIZE) / SUB_BLOCK_SIZE;
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
let d_byte_base = block_byte_base;
let qs_byte_base = block_byte_base + 4u;
let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(d_byte_base), sub_block)) * 0.5;
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j++) {
let q_packed = load_u32_at_src0_aligned(qs_byte_base + sub_block * 8u + j * 4u);
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
let q_byte = get_byte(q_packed, k);
shmem[i + j * BYTES_PER_INNER_LOOP + k] = f16(f32(kvalues_mxfp4[q_byte & 0xF]) * d);
shmem[i + j * BYTES_PER_INNER_LOOP + k + 8u] = f16(f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d);
}
}
}
}
#endif // INIT_SRC0_SHMEM_NVFP4
// k-quants
#if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
const BLOCK_SIZE = 256u;
@@ -1505,3 +1505,49 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src
return acc;
}
#endif
#ifdef MUL_ACC_NVFP4
#define BLOCK_SIZE 64
#define BLOCK_SIZE_BYTES 36
#define THREADS_PER_BLOCK 4
#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array<array<f32, OUTPUTS_PER_WG>, NUM_COLS> {
var acc: array<array<f32, OUTPUTS_PER_WG>, NUM_COLS>;
let num_blocks = params.k / BLOCK_SIZE;
let sub = thread_id % THREADS_PER_BLOCK;
for (var block = thread_id/THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE/THREADS_PER_BLOCK) {
let x_base = src1_idx_base + block * BLOCK_SIZE + sub * ELEMS_PER_THREAD;
var x_block: array<array<f32, ELEMS_PER_THREAD>, NUM_COLS>;
for (var col = 0u; col < NUM_COLS;col += 1) {
for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) {
x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]);
x_block[col][i + 8] = f32(src1[x_base + col * params.stride_11 + i + 8]);
}
}
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let output_row = row_base + row;
if (output_row < params.m) {
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(block_byte_base), sub)) * 0.5;
let q_w0 = load_u32_at_src0_aligned(block_byte_base + 4u + 8u * sub);
let q_w1 = load_u32_at_src0_aligned(block_byte_base + 8u + 8u * sub);
for (var col = 0u;col < NUM_COLS;col += 1) {
var row_sum = 0.0;
for (var l = 0u; l < 8u; l++) {
let q_word = select(q_w0, q_w1, l >= 4u);
let q_byte = get_byte(q_word, l % 4u);
let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4u) & 0xFu]) * d;
row_sum += q_lo * x_block[col][l];
row_sum += q_hi * x_block[col][l + 8u];
}
acc[col][row] += row_sum;
}
}
}
}
return acc;
}
#endif
+2
View File
@@ -121,6 +121,8 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+1
View File
@@ -7759,6 +7759,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 70000, 4, 1, false)); // row count > CUDA grid-y limit (65535)
for (ggml_type type : all_types) {
for (bool v : {false, true}) {
test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
+1 -1
View File
@@ -39,7 +39,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
}
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), common_http_format_host(parsed_url.host).c_str(), parsed_url.port, parsed_url.path.c_str());
std::map<std::string, std::string> headers;
const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
+2 -1
View File
@@ -1,4 +1,5 @@
#include "common.h"
#include "http.h"
#include "server-http.h"
#include "server-stream.h"
#include "server-common.h"
@@ -441,7 +442,7 @@ bool server_http_context::start() {
srv->wait_until_ready();
listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
: string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
: string_format("%s://%s:%d", is_ssl ? "https" : "http", common_http_format_host(hostname).c_str(), port);
return true;
}
+3 -1
View File
@@ -1,4 +1,5 @@
#include "server-common.h"
#include "http.h"
#include "server-models.h"
#include "server-context.h"
#include "server-stream.h"
@@ -2263,7 +2264,8 @@ server_http_proxy::server_http_proxy(
}
if (lowered == "host") {
bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
const std::string url_host = common_http_format_host(host);
req.set_header(key, is_default_port ? url_host : url_host + ":" + std::to_string(port));
} else {
req.set_header(key, value);
}