Compare commits

..

4 Commits

Author SHA1 Message Date
Winston Ma 32120c10e3 vulkan: prefer host-visible memory buffers on UMA devices (#22930)
* implement UMA host-visible memory

* update based on 0cc4m's suggestion
2026-06-16 09:36:52 +02:00
Jeff Bolz d5fb104293 vulkan: Support gated_delta_net with S_v=16 (#24581) 2026-06-16 09:26:57 +02:00
Ruixiang Wang 635b65ad7a spec: add spec metrics mean acceptance length and acceptance rate per position (#24536)
* spec: add spec metrics mean acceptance length and acceptance per pos

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-16 10:23:09 +03:00
Adrien Gallouët e3a74b2990 bench : add --offline (#24511)
* bench : add --offline

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Add default

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-16 08:26:05 +02:00
5 changed files with 103 additions and 18 deletions
+30 -1
View File
@@ -140,6 +140,8 @@ struct common_speculative_impl {
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
@@ -2059,6 +2061,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
}
for (size_t i = 0; i < n_accepted; ++i) {
impl->n_acc_tokens_per_pos[i]++;
}
if (n_accepted > 0) {
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2104,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
std::string str_stats;
if (impl->n_call_accept > 0) {
const double mean =
1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
std::ostringstream tmp;
tmp << std::fixed << std::setprecision(3);
for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
if (i > 0) {
tmp << ", ";
}
tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
}
std::ostringstream oss;
oss << std::fixed << std::setprecision(2) << mean;
str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
str_stats.c_str(),
str_perf.c_str());
}
}
+36 -14
View File
@@ -911,8 +911,8 @@ struct vk_device_struct {
vk_pipeline pipeline_pool2d_f32;
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
// [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
vk_pipeline pipeline_gated_delta_net[3][2];
// [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128
vk_pipeline pipeline_gated_delta_net[4][2];
vk_pipeline pipeline_ssm_scan_f32_d128;
vk_pipeline pipeline_ssm_scan_f32_d256;
vk_pipeline pipeline_ssm_conv_f32;
@@ -3080,8 +3080,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal});
} else if (device->uma) {
// Fall back to host memory type
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
// On UMA, prefer host-visible memory so direct tensor borrowing works.
// If unavailable, fall back to device-local memory.
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else if (device->disable_host_visible_vidmem) {
if (device->allow_sysmem_fallback) {
@@ -5231,14 +5233,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
{
const uint32_t gdn_sizes[] = {32, 64, 128};
const uint32_t gdn_sizes[] = {16, 32, 64, 128};
const char * gdn_names[][2] = {
{"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"},
{"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"},
{"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"},
{"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"},
};
const bool use_subgroup_reduce = device->subgroup_arithmetic;
for (uint32_t si = 0; si < 3; si++) {
for (uint32_t si = 0; si < 4; si++) {
const uint32_t S_V = gdn_sizes[si];
GGML_ASSERT(is_pow2(S_V));
@@ -5252,10 +5254,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
lanes_per_column = std::min(S_V, device->subgroup_size);
}
const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
// gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and
// S_V % LANES_PER_COLUMN == 0 to avoid bounds checks.
while (lanes_per_column > 1u) {
const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 &&
(S_V % lanes_per_column) == 0;
const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0;
if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) {
break;
}
lanes_per_column >>= 1u;
}
GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0);
GGML_ASSERT((S_V % lanes_per_column) == 0);
GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0);
const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size;
const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce;
const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce;
const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce;
size_t gdn_len;
const void * gdn_data;
if (use_subgroup_reduce && need_clustered_shader) {
if (use_clustered_reduce) {
gdn_len = gated_delta_net_f32_len;
gdn_data = (const void *)gated_delta_net_f32_data;
} else if (use_subgroup_reduce) {
@@ -5272,7 +5293,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
for (uint32_t kda = 0; kda < 2; kda++) {
ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda],
gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size);
}
}
}
@@ -10746,9 +10767,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
uint32_t si;
switch (S_v) {
case 32: si = 0; break;
case 64: si = 1; break;
case 128: si = 2; break;
case 16: si = 0; break;
case 32: si = 1; break;
case 64: si = 2; break;
case 128: si = 3; break;
default: return nullptr;
}
return ctx->device->pipeline_gated_delta_net[si][kda];
@@ -17193,7 +17215,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_GATED_DELTA_NET:
{
const uint32_t S_v = op->src[2]->ne[0];
if (S_v != 32 && S_v != 64 && S_v != 128) {
if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) {
return false;
}
for (int i = 0; i < 6; i++) {
+7
View File
@@ -323,6 +323,7 @@ struct cmd_params {
std::vector<std::string> hf_repo;
std::vector<std::string> hf_file;
std::string hf_token;
bool offline;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
@@ -367,6 +368,7 @@ static const cmd_params cmd_params_defaults = {
/* hf_repo */ {},
/* hf_file */ {},
/* hf_token */ "",
/* offline */ false,
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
@@ -437,6 +439,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" (default: unused)\n");
printf(" -hft, --hf-token <token> Hugging Face access token\n");
printf(" (default: value from HF_TOKEN environment variable)\n");
printf(" --offline Offline mode: forces use of cache, prevents network access\n");
printf(" (default: disabled)\n");
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
@@ -558,6 +562,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.hf_token = argv[i];
} else if (arg == "--offline") {
params.offline = true;
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -1040,6 +1046,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
common_download_opts opts;
opts.bearer_token = params.hf_token;
opts.offline = params.offline;
auto download_result = common_download_model(model, opts);
if (download_result.model_path.empty()) {
fprintf(stderr, "error: failed to download model from HuggingFace\n");
+3
View File
@@ -40,6 +40,7 @@ def main(args_in: list[str] | None = None) -> None:
required=True)
parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
parser.add_argument("--offline", action="store_true", default=False, help="Offline mode: forces use of cache, prevents network access")
parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
@@ -268,6 +269,8 @@ def start_server_background(args):
]
server_args.extend(['--hf-repo', args.hf_repo])
server_args.extend(['--hf-file', args.hf_file])
if args.offline:
server_args.append('--offline')
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
server_args.extend(['--ctx-size', args.ctx_size])
server_args.extend(['--parallel', args.parallel])
+27 -3
View File
@@ -201,6 +201,8 @@ struct server_slot {
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position
void reset() {
SLT_DBG(*this, "%s", "\n");
@@ -227,6 +229,8 @@ struct server_slot {
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
n_draft_verif_steps = 0;
n_accepted_per_pos.clear();
task_prev = std::move(task);
task.reset();
@@ -509,10 +513,22 @@ struct server_slot {
llama_perf_context(ctx_tgt).n_reused);
if (n_draft_total > 0) {
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
std::string acceptance_rates_per_pos;
if (n_draft_verif_steps > 0) {
for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
if (i > 0) {
acceptance_rates_per_pos += ", ";
}
acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
}
}
SLT_INF(*this,
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
draft_ratio, n_draft_accepted, n_draft_total);
"draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
}
common_speculative_print_stats(spec);
@@ -3543,6 +3559,14 @@ private:
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
slot.n_draft_verif_steps += 1;
if (slot.n_accepted_per_pos.empty()) {
slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
}
for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
slot.n_accepted_per_pos[i]++;
}
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);