mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-05 20:15:53 +02:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0eca4d490e | |||
| 4f31eedb0c | |||
| 799fcc04a5 | |||
| 931eb37f8c | |||
| e495d1e748 | |||
| f708a5b2ca | |||
| d9df11006f | |||
| 6c5de1cc83 | |||
| 86b94708f2 |
+28
-6
@@ -11,6 +11,11 @@ struct common_http_url {
|
||||
std::string path;
|
||||
};
|
||||
|
||||
// bracket an IPv6 literal host for a URL authority (RFC 3986)
|
||||
static std::string common_http_format_host(const std::string & host) {
|
||||
return host.find(':') != std::string::npos ? "[" + host + "]" : host;
|
||||
}
|
||||
|
||||
static common_http_url common_http_parse_url(const std::string & url) {
|
||||
common_http_url parts;
|
||||
auto scheme_end = url.find("://");
|
||||
@@ -49,11 +54,28 @@ static common_http_url common_http_parse_url(const std::string & url) {
|
||||
parts.path = "/";
|
||||
}
|
||||
|
||||
auto colon_pos = parts.host.find(':');
|
||||
// split the authority into host and optional port, a bracketed IPv6 literal keeps its inner colons (RFC 3986)
|
||||
std::string port_str;
|
||||
if (!parts.host.empty() && parts.host.front() == '[') {
|
||||
auto close = parts.host.find(']');
|
||||
if (close == std::string::npos) {
|
||||
throw std::runtime_error("invalid IPv6 URL authority: " + parts.host);
|
||||
}
|
||||
auto after = parts.host.substr(close + 1);
|
||||
if (!after.empty() && after.front() == ':') {
|
||||
port_str = after.substr(1);
|
||||
}
|
||||
parts.host = parts.host.substr(1, close - 1);
|
||||
} else {
|
||||
auto colon_pos = parts.host.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
port_str = parts.host.substr(colon_pos + 1);
|
||||
parts.host = parts.host.substr(0, colon_pos);
|
||||
}
|
||||
}
|
||||
|
||||
if (colon_pos != std::string::npos) {
|
||||
parts.port = std::stoi(parts.host.substr(colon_pos + 1));
|
||||
parts.host = parts.host.substr(0, colon_pos);
|
||||
if (!port_str.empty()) {
|
||||
parts.port = std::stoi(port_str);
|
||||
} else if (parts.scheme == "http") {
|
||||
parts.port = 80;
|
||||
} else if (parts.scheme == "https") {
|
||||
@@ -83,7 +105,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
|
||||
}
|
||||
#endif
|
||||
|
||||
httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
|
||||
httplib::Client cli(parts.scheme + "://" + common_http_format_host(parts.host) + ":" + std::to_string(parts.port));
|
||||
|
||||
if (!parts.user.empty()) {
|
||||
cli.set_basic_auth(parts.user, parts.password);
|
||||
@@ -95,5 +117,5 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
|
||||
}
|
||||
|
||||
static std::string common_http_show_masked_url(const common_http_url & parts) {
|
||||
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
|
||||
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + common_http_format_host(parts.host) + parts.path;
|
||||
}
|
||||
|
||||
@@ -1551,8 +1551,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
|
||||
ggml_backend_synchronize(split_backend);
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||
@@ -1563,15 +1561,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else if (!split_backend->iface.cpy_tensor_async) {
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else if (!split_backend->iface.cpy_tensor_async) {
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
|
||||
@@ -1676,8 +1674,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_synchronize(split_backend);
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
|
||||
@@ -664,7 +664,7 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
|
||||
template <int ncols1>
|
||||
__launch_bounds__(FATTN_KQ_STRIDE/2, 1)
|
||||
static __global__ void flash_attn_mask_to_KV_max(
|
||||
const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) {
|
||||
const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int64_t s31, const int64_t s33) {
|
||||
const int ne31 = gridDim.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int sequence = blockIdx.y;
|
||||
@@ -1089,8 +1089,8 @@ void launch_fattn(
|
||||
// Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
|
||||
// multiple sequences of possibly different lengths.
|
||||
if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
|
||||
const int s31 = mask->nb[1] / sizeof(half2);
|
||||
const int s33 = mask->nb[3] / sizeof(half2);
|
||||
const int64_t s31 = mask->nb[1] / sizeof(half2);
|
||||
const int64_t s33 = mask->nb[3] / sizeof(half2);
|
||||
|
||||
const dim3 blocks_num_KV_max(ntiles_x, Q->ne[3], 1);
|
||||
const dim3 block_dim_KV_max(FATTN_KQ_STRIDE/2, 1, 1);
|
||||
|
||||
@@ -2003,6 +2003,10 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64)
|
||||
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 4, 2);
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 8, 2);
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 16, 2);
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 32, 2);
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 2, 4);
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 4, 4);
|
||||
extern DECL_FATTN_MMA_F16_CASE(512, 512, 8, 4);
|
||||
|
||||
@@ -76,6 +76,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
|
||||
@@ -144,6 +145,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 32, 64)
|
||||
@@ -219,6 +221,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
|
||||
@@ -296,6 +299,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4, 64, 64)
|
||||
@@ -1308,12 +1312,12 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (DV <= 256) {
|
||||
if (use_gqa_opt && gqa_ratio % 2 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
if (use_gqa_opt && gqa_ratio % 2 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (DV <= 256) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 1, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -99,12 +99,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 256) {
|
||||
if (use_gqa_opt && gqa_ratio > 1) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
if (use_gqa_opt && gqa_ratio > 1) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 256) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
@@ -78,26 +78,29 @@ static __global__ void k_get_rows_float(
|
||||
|
||||
template<typename grad_t, typename dst_t>
|
||||
static __global__ void k_get_rows_back_float(
|
||||
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
|
||||
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst,
|
||||
const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) {
|
||||
const int col = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if (col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t i = 0; i < nrows_grad; ++i) {
|
||||
if (rows[i] != dst_row) {
|
||||
continue;
|
||||
}
|
||||
sum += grad[i*ncols + col];
|
||||
}
|
||||
|
||||
dst[dst_row*ncols + col] = sum;
|
||||
// grid.y is clamped to the CUDA grid limit, so stride over the destination rows
|
||||
for (int64_t dst_row = blockIdx.y; dst_row < nrows_dst; dst_row += gridDim.y) {
|
||||
float sum = 0.0f;
|
||||
|
||||
for (int64_t i = 0; i < nrows_grad; ++i) {
|
||||
if (rows[i] != dst_row) {
|
||||
continue;
|
||||
}
|
||||
sum += grad[i*ncols + col];
|
||||
}
|
||||
|
||||
dst[dst_row*ncols + col] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
|
||||
@@ -302,7 +305,7 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d
|
||||
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
|
||||
const dim3 block_nums(block_num_x, ne1, 1);
|
||||
const dim3 block_nums(block_num_x, MIN(ne1, (int64_t)UINT16_MAX), 1);
|
||||
|
||||
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
|
||||
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10, ne1);
|
||||
}
|
||||
|
||||
@@ -3192,24 +3192,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
|
||||
// Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
|
||||
// Excluding this path for HIP and MUSA as a precaution.
|
||||
// According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
|
||||
// Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
|
||||
// e.g. https://github.com/ROCm/rocm-systems/issues/5109
|
||||
// It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
|
||||
// This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
|
||||
#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
|
||||
const bool copy_from_host = false;
|
||||
#else
|
||||
const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
#endif
|
||||
|
||||
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3220,17 +3207,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
|
||||
|
||||
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
|
||||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
|
||||
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
|
||||
#endif // NDEBUG
|
||||
return false;
|
||||
}
|
||||
|
||||
if (copy_from_host) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
|
||||
} else if (backend_src != backend_dst) {
|
||||
if (backend_src != backend_dst) {
|
||||
// copy on src stream
|
||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
|
||||
@@ -368,5 +368,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
return true;
|
||||
}
|
||||
|
||||
// gfx900 (Vega 10) lacks native dp4a, loses to dequant + hipBLAS
|
||||
// for dense matrices; keep MMQ only for MoE, where the
|
||||
// hipBLAS path is much slower.
|
||||
if (cc == GGML_CUDA_CC_VEGA) {
|
||||
return n_experts > 0;
|
||||
}
|
||||
|
||||
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 16, 2);
|
||||
|
||||
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 32, 2);
|
||||
|
||||
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 4, 2);
|
||||
|
||||
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 8, 2);
|
||||
|
||||
@@ -92,7 +92,7 @@ for ncols in [8, 16, 32, 64]:
|
||||
continue
|
||||
if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
|
||||
continue
|
||||
if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
|
||||
if head_size_kq == 512 and ncols2 not in (2, 4, 8): # Gemma 4 (+ MTP)
|
||||
continue
|
||||
if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
|
||||
continue
|
||||
|
||||
@@ -2475,6 +2475,85 @@ static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count
|
||||
return true;
|
||||
}
|
||||
|
||||
// Remove the loop unrolling hint of the matmul shader's BK loop
|
||||
// and replace it with the dont_unroll hint for better performance on
|
||||
// hardware like Apple M1/M2.
|
||||
// Assumes 1. code comes from mul_mm.comp 2. the K-tile loop has no loop
|
||||
// control hint and 3. the BK loop is the last loop nested directly inside
|
||||
// the K-tile loop.
|
||||
// Returns true when the input was modified; returns false otherwise
|
||||
// without touching `out`.
|
||||
static bool ggml_vk_roll_bk_loop(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
|
||||
if (word_count < 5) {
|
||||
return false;
|
||||
}
|
||||
|
||||
struct vk_spv_loop {
|
||||
size_t header;
|
||||
size_t end;
|
||||
uint32_t control;
|
||||
};
|
||||
|
||||
std::vector<vk_spv_loop> loops;
|
||||
|
||||
// Collect a list of all loops in the module.
|
||||
for (size_t pos = 5; pos < word_count; ) {
|
||||
const uint32_t wc = code[pos] >> spv::WordCountShift;
|
||||
const uint32_t op = code[pos] & spv::OpCodeMask;
|
||||
if (wc == 0 || pos + wc > word_count) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (op == spv::OpLoopMerge && wc >= 4) { loops.push_back({ pos, 0, code[pos + 3] }); }
|
||||
|
||||
if (op == spv::OpLabel && wc >= 2) {
|
||||
for (auto & l : loops) {
|
||||
if (l.end == 0 && code[l.header + 1] == code[pos + 1]) { l.end = pos; }
|
||||
}
|
||||
}
|
||||
|
||||
pos += wc;
|
||||
}
|
||||
|
||||
auto encloses = [](const vk_spv_loop & a, const vk_spv_loop & b) {
|
||||
return a.header < b.header && b.header < a.end;
|
||||
};
|
||||
|
||||
// Find the BK loop.
|
||||
const vk_spv_loop * bk = nullptr;
|
||||
for (const auto & h : loops) {
|
||||
if (h.control != spv::LoopControlUnrollMask) {
|
||||
continue;
|
||||
}
|
||||
const vk_spv_loop * parent = nullptr;
|
||||
bool has_child = false;
|
||||
for (const auto & g : loops) {
|
||||
if (encloses(g, h) && (!parent || g.header > parent->header)) {
|
||||
parent = &g;
|
||||
}
|
||||
if (encloses(h, g)) {
|
||||
has_child = true;
|
||||
}
|
||||
}
|
||||
// BK loop should be the last loop nested inside the loop with no hint
|
||||
// and have at least one child loop.
|
||||
if (parent &&
|
||||
parent->control == spv::LoopControlMaskNone &&
|
||||
has_child &&
|
||||
(!bk || h.header > bk->header)) {
|
||||
bk = &h;
|
||||
}
|
||||
}
|
||||
if (!bk) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// set DontUnroll instead of Unroll
|
||||
out.assign(code, code + word_count);
|
||||
out[bk->header + 3] = spv::LoopControlDontUnrollMask;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
|
||||
uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
|
||||
bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
|
||||
@@ -2558,6 +2637,22 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
}
|
||||
#endif
|
||||
|
||||
#if VK_HEADER_VERSION >= 287
|
||||
// Roll the mul_mm BK loop on Asahi Linux. Skip bf16 and the mul_mmq pipelines.
|
||||
if (device->driver_id == vk::DriverId::eMesaHoneykrisp &&
|
||||
pipeline->name.rfind("matmul", 0) == 0 &&
|
||||
pipeline->name.find("bf16") == std::string::npos &&
|
||||
pipeline->name.find("q8_1") == std::string::npos) {
|
||||
const uint32_t * src = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
|
||||
size_t src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
|
||||
std::vector<uint32_t> rolled;
|
||||
if (ggml_vk_roll_bk_loop(src, src_n, rolled)) {
|
||||
spirv = std::move(rolled);
|
||||
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||
|
||||
vk::PushConstantRange pcr(
|
||||
|
||||
@@ -1563,6 +1563,7 @@ class ggml_webgpu_shader_lib {
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
{
|
||||
// Quantized types using u32 buffers for portability.
|
||||
defines.push_back("SRC_TYPE=u32");
|
||||
@@ -1593,6 +1594,8 @@ class ggml_webgpu_shader_lib {
|
||||
} else if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
|
||||
key.src_type == GGML_TYPE_IQ4_NL || key.src_type == GGML_TYPE_MXFP4) {
|
||||
defines.push_back("BLOCK_SIZE=32u");
|
||||
} else if (key.src_type == GGML_TYPE_NVFP4) {
|
||||
defines.push_back("BLOCK_SIZE=64u");
|
||||
} else if (key.src_type >= GGML_TYPE_Q2_K) {
|
||||
defines.push_back("BLOCK_SIZE=256u");
|
||||
} else {
|
||||
@@ -1960,6 +1963,7 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back(type_upper + "_TABLES");
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
defines.push_back(type_upper + "_LUT");
|
||||
break;
|
||||
default:
|
||||
@@ -2103,6 +2107,7 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back(type_upper + "_TABLES");
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
defines.push_back(type_upper + "_LUT");
|
||||
break;
|
||||
default:
|
||||
@@ -2274,6 +2279,7 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back(type_upper + "_TABLES");
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
defines.push_back(type_upper + "_LUT");
|
||||
break;
|
||||
default:
|
||||
@@ -2394,6 +2400,7 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back(type_upper + "_TABLES");
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
defines.push_back(type_upper + "_LUT");
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -4056,6 +4056,7 @@ static bool ggml_webgpu_supported_qtype(ggml_type type) {
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -4156,6 +4157,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
supports_op = true;
|
||||
break;
|
||||
default:
|
||||
@@ -4196,6 +4198,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
supports_op = true;
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -896,9 +896,23 @@ const kvalues_iq4nl = array<i32, 16>(
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef MXFP4_LUT
|
||||
#if defined(MXFP4_LUT) || defined(NVFP4_LUT)
|
||||
const kvalues_mxfp4 = array<i32, 16>(
|
||||
0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12
|
||||
);
|
||||
#endif
|
||||
#endif // MXFP4_LUT || NVFP4_LUT
|
||||
|
||||
#ifdef NVFP4_LUT
|
||||
fn ue4m3_to_fp32(u: u32) -> f32 {
|
||||
if (u == 0u || u == 127u) {
|
||||
return 0.0;
|
||||
}
|
||||
let exp = (u >> 3u) & 15u;
|
||||
let man = u & 7u;
|
||||
if (exp == 0u) {
|
||||
return f32(man) * (1.0 / 512.0);
|
||||
}
|
||||
let bits = ((exp + 120u) << 23u) | (man << 20u);
|
||||
return bitcast<f32>(bits);
|
||||
}
|
||||
#endif // NVFP4_LUT
|
||||
|
||||
@@ -672,6 +672,27 @@ fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef NVFP4
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block_byte_base = (src_base + offset) * 36;
|
||||
let d_word = load_u32_at_src(block_byte_base);
|
||||
for (var sub: u32 = 0u; sub < 4; sub++) {
|
||||
let d = ue4m3_to_fp32(get_byte(d_word, sub)) * 0.5;
|
||||
for (var j: u32 = 0u; j < 2; j++) {
|
||||
let q_packed = load_u32_at_src(block_byte_base + 4 + sub * 8 + j * 4);
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
|
||||
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d;
|
||||
let dst_offset = dst_base + offset * 64 + sub * 16 + j * 4 + k;
|
||||
dst[dst_offset] = q_lo;
|
||||
dst[dst_offset + 8u] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<SRC_TYPE>;
|
||||
|
||||
@@ -241,7 +241,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
#endif // INIT_SRC0_SHMEM_Q8_1
|
||||
|
||||
#if defined(INIT_SRC0_SHMEM_MXFP4)
|
||||
let block_byte_base = src0_idx * 17u;
|
||||
let block_byte_base = src0_idx * 17u; // BLOCK_SIZE_BYTES = 17u;
|
||||
let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
|
||||
let e = ldexp(1.0, i32(eu8) - 128);
|
||||
|
||||
@@ -263,6 +263,47 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
}
|
||||
#endif // legacy-quants
|
||||
|
||||
#if defined(INIT_SRC0_SHMEM_NVFP4)
|
||||
const BLOCK_SIZE = 64u;
|
||||
const BLOCK_SIZE_BYTES = 36u;
|
||||
const SUB_BLOCK_SIZE = 16u; // elements sharing one UE4M3 scale
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 8u;
|
||||
const BYTES_PER_INNER_LOOP = 4u;
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let tile_m = i / TILE_K;
|
||||
let tile_k_start = i % TILE_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k_start = k_outer + tile_k_start;
|
||||
|
||||
if (global_m >= params.m) {
|
||||
break;
|
||||
}
|
||||
|
||||
let block_k = global_k_start / BLOCK_SIZE;
|
||||
let sub_block = (global_k_start % BLOCK_SIZE) / SUB_BLOCK_SIZE;
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
let d_byte_base = block_byte_base;
|
||||
let qs_byte_base = block_byte_base + 4u;
|
||||
|
||||
let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(d_byte_base), sub_block)) * 0.5;
|
||||
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j++) {
|
||||
let q_packed = load_u32_at_src0_aligned(qs_byte_base + sub_block * 8u + j * 4u);
|
||||
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
shmem[i + j * BYTES_PER_INNER_LOOP + k] = f16(f32(kvalues_mxfp4[q_byte & 0xF]) * d);
|
||||
shmem[i + j * BYTES_PER_INNER_LOOP + k + 8u] = f16(f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_NVFP4
|
||||
|
||||
// k-quants
|
||||
#if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
|
||||
const BLOCK_SIZE = 256u;
|
||||
|
||||
@@ -1505,3 +1505,49 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src
|
||||
return acc;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MUL_ACC_NVFP4
|
||||
#define BLOCK_SIZE 64
|
||||
#define BLOCK_SIZE_BYTES 36
|
||||
#define THREADS_PER_BLOCK 4
|
||||
#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
|
||||
fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array<array<f32, OUTPUTS_PER_WG>, NUM_COLS> {
|
||||
var acc: array<array<f32, OUTPUTS_PER_WG>, NUM_COLS>;
|
||||
|
||||
let num_blocks = params.k / BLOCK_SIZE;
|
||||
let sub = thread_id % THREADS_PER_BLOCK;
|
||||
for (var block = thread_id/THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE/THREADS_PER_BLOCK) {
|
||||
let x_base = src1_idx_base + block * BLOCK_SIZE + sub * ELEMS_PER_THREAD;
|
||||
var x_block: array<array<f32, ELEMS_PER_THREAD>, NUM_COLS>;
|
||||
for (var col = 0u; col < NUM_COLS;col += 1) {
|
||||
for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) {
|
||||
x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]);
|
||||
x_block[col][i + 8] = f32(src1[x_base + col * params.stride_11 + i + 8]);
|
||||
}
|
||||
}
|
||||
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
|
||||
let output_row = row_base + row;
|
||||
if (output_row < params.m) {
|
||||
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
|
||||
let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(block_byte_base), sub)) * 0.5;
|
||||
let q_w0 = load_u32_at_src0_aligned(block_byte_base + 4u + 8u * sub);
|
||||
let q_w1 = load_u32_at_src0_aligned(block_byte_base + 8u + 8u * sub);
|
||||
for (var col = 0u;col < NUM_COLS;col += 1) {
|
||||
var row_sum = 0.0;
|
||||
for (var l = 0u; l < 8u; l++) {
|
||||
let q_word = select(q_w0, q_w1, l >= 4u);
|
||||
let q_byte = get_byte(q_word, l % 4u);
|
||||
let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
|
||||
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4u) & 0xFu]) * d;
|
||||
row_sum += q_lo * x_block[col][l];
|
||||
row_sum += q_hi * x_block[col][l + 8u];
|
||||
}
|
||||
acc[col][row] += row_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return acc;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -121,6 +121,8 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
|
||||
@@ -7759,6 +7759,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
||||
test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 70000, 4, 1, false)); // row count > CUDA grid-y limit (65535)
|
||||
for (ggml_type type : all_types) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
|
||||
|
||||
@@ -39,7 +39,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
|
||||
throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
|
||||
}
|
||||
|
||||
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
|
||||
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), common_http_format_host(parsed_url.host).c_str(), parsed_url.port, parsed_url.path.c_str());
|
||||
|
||||
std::map<std::string, std::string> headers;
|
||||
const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "common.h"
|
||||
#include "http.h"
|
||||
#include "server-http.h"
|
||||
#include "server-stream.h"
|
||||
#include "server-common.h"
|
||||
@@ -441,7 +442,7 @@ bool server_http_context::start() {
|
||||
srv->wait_until_ready();
|
||||
|
||||
listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
|
||||
: string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
|
||||
: string_format("%s://%s:%d", is_ssl ? "https" : "http", common_http_format_host(hostname).c_str(), port);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "server-common.h"
|
||||
#include "http.h"
|
||||
#include "server-models.h"
|
||||
#include "server-context.h"
|
||||
#include "server-stream.h"
|
||||
@@ -2263,7 +2264,8 @@ server_http_proxy::server_http_proxy(
|
||||
}
|
||||
if (lowered == "host") {
|
||||
bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
|
||||
req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
|
||||
const std::string url_host = common_http_format_host(host);
|
||||
req.set_header(key, is_default_port ? url_host : url_host + ":" + std::to_string(port));
|
||||
} else {
|
||||
req.set_header(key, value);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user