Compare commits

...

7 Commits

Author SHA1 Message Date
Alessandro de Oliveira Faria (A.K.A.CABELO) 74ade52741 vendor : update BoringSSL to 0.20260616.0 (#24693) 2026-06-16 20:24:28 +02:00
Pascal c1304d7b28 ui: add source toggle to mermaid and svg blocks (#24652)
* ui: add source toggle to mermaid and svg blocks

Add a toggle button next to copy and preview that switches a rendered
mermaid or svg block to its source code and back. The button is shared by
both block types and the rendered view stays the default.

The source view reuses the code block scroll container and the highlighted
code element captured at transform time, so it matches the app code blocks
without highlighting again.

Make tall diagrams scroll like text code blocks: safe centering keeps the
diagram centered when it fits and falls back to start alignment when it
overflows, so the top stays reachable instead of clipping above.

Keep the block header opaque and layered above the scrolled diagram, and
ignore header clicks in the zoom handler, so a button click never falls
through to the zoom dialog.

* ui: transparent diagram block header, address review from @allozaur
2026-06-16 14:14:22 +02:00
Oliver Simons 02810c7aa8 Fix and restrict NVFP4 edge-cases in llama-graph (#24331)
* Move the post-GEMM MUL required for dequantization before the LoRA and bias additions

see https://github.com/ggml-org/llama.cpp/pull/23484 :
1. For lora, I would presume we want fully dequantized values before
   doing the residuals, but this depends on how the LORAs were
generated. Literature tells me LORA happens post-mul but pre-bias add https://github.com/ggml-org/llama.cpp/pull/8332
2. For ModelOPT, bias-add should happen on [fully-dequantized
   values](https://github.com/NVIDIA/Model-Optimizer/blob/b49f9b9e2d747af992d78a3aa7f10efe5a8847e1/modelopt/torch/quantization/backends/nvfp4_gemm.py#L59-L64)

* Restrict build_ffn for NVFP4 to supported combinations
2026-06-16 11:52:38 +02:00
Ruixiang Wang a1824902b5 spec: add backend sampling support for eagle3 (#24655) 2026-06-16 12:05:52 +03:00
Winston Ma 32120c10e3 vulkan: prefer host-visible memory buffers on UMA devices (#22930)
* implement UMA host-visible memory

* update based on 0cc4m's suggestion
2026-06-16 09:36:52 +02:00
Jeff Bolz d5fb104293 vulkan: Support gated_delta_net with S_v=16 (#24581) 2026-06-16 09:26:57 +02:00
Ruixiang Wang 635b65ad7a spec: add spec metrics mean acceptance length and acceptance rate per position (#24536)
* spec: add spec metrics mean acceptance length and acceptance per pos

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-16 10:23:09 +03:00
15 changed files with 379 additions and 88 deletions
+62 -2
View File
@@ -140,6 +140,8 @@ struct common_speculative_impl {
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
@@ -416,6 +418,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
std::vector<common_sampler_ptr> smpls;
// backend sampler chain per seq, attached to ctx_dft
std::vector<llama_sampler *> backend_chains;
int32_t n_embd_dec = 0; // draft hidden size
int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size
int32_t n_embd_tgt = 0; // target model hidden size
@@ -441,7 +446,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
, params(params.draft)
{
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling);
auto * ctx_tgt = this->params.ctx_tgt;
auto * ctx_dft = this->params.ctx_dft;
@@ -476,6 +481,22 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
}
// offload draft sampling to the backend
backend_chains.assign(n_seq, nullptr);
if (this->params.backend_sampling) {
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
llama_sampler_free(chain);
chain = nullptr;
}
backend_chains[seq_id] = chain;
}
}
// turn on extraction of the target layers' input embeddings
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
@@ -494,6 +515,18 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
}
~common_speculative_impl_draft_eagle3() override {
auto * ctx_dft = this->params.ctx_dft;
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
if (backend_chains[seq_id] == nullptr) {
continue;
}
if (ctx_dft) {
llama_set_sampler(ctx_dft, seq_id, nullptr);
}
llama_sampler_free(backend_chains[seq_id]);
}
backend_chains.clear();
if (batch.token != nullptr) {
free(batch.token);
batch.token = nullptr;
@@ -2059,6 +2092,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
}
for (size_t i = 0; i < n_accepted; ++i) {
impl->n_acc_tokens_per_pos[i]++;
}
if (n_accepted > 0) {
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2135,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
std::string str_stats;
if (impl->n_call_accept > 0) {
const double mean =
1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
std::ostringstream tmp;
tmp << std::fixed << std::setprecision(3);
for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
if (i > 0) {
tmp << ", ";
}
tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
}
std::ostringstream oss;
oss << std::fixed << std::setprecision(2) << mean;
str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
str_stats.c_str(),
str_perf.c_str());
}
}
+36 -14
View File
@@ -911,8 +911,8 @@ struct vk_device_struct {
vk_pipeline pipeline_pool2d_f32;
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
// [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
vk_pipeline pipeline_gated_delta_net[3][2];
// [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128
vk_pipeline pipeline_gated_delta_net[4][2];
vk_pipeline pipeline_ssm_scan_f32_d128;
vk_pipeline pipeline_ssm_scan_f32_d256;
vk_pipeline pipeline_ssm_conv_f32;
@@ -3080,8 +3080,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal});
} else if (device->uma) {
// Fall back to host memory type
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
// On UMA, prefer host-visible memory so direct tensor borrowing works.
// If unavailable, fall back to device-local memory.
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else if (device->disable_host_visible_vidmem) {
if (device->allow_sysmem_fallback) {
@@ -5231,14 +5233,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
{
const uint32_t gdn_sizes[] = {32, 64, 128};
const uint32_t gdn_sizes[] = {16, 32, 64, 128};
const char * gdn_names[][2] = {
{"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"},
{"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"},
{"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"},
{"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"},
};
const bool use_subgroup_reduce = device->subgroup_arithmetic;
for (uint32_t si = 0; si < 3; si++) {
for (uint32_t si = 0; si < 4; si++) {
const uint32_t S_V = gdn_sizes[si];
GGML_ASSERT(is_pow2(S_V));
@@ -5252,10 +5254,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
lanes_per_column = std::min(S_V, device->subgroup_size);
}
const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
// gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and
// S_V % LANES_PER_COLUMN == 0 to avoid bounds checks.
while (lanes_per_column > 1u) {
const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 &&
(S_V % lanes_per_column) == 0;
const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0;
if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) {
break;
}
lanes_per_column >>= 1u;
}
GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0);
GGML_ASSERT((S_V % lanes_per_column) == 0);
GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0);
const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size;
const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce;
const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce;
const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce;
size_t gdn_len;
const void * gdn_data;
if (use_subgroup_reduce && need_clustered_shader) {
if (use_clustered_reduce) {
gdn_len = gated_delta_net_f32_len;
gdn_data = (const void *)gated_delta_net_f32_data;
} else if (use_subgroup_reduce) {
@@ -5272,7 +5293,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
for (uint32_t kda = 0; kda < 2; kda++) {
ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda],
gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size);
}
}
}
@@ -10746,9 +10767,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
uint32_t si;
switch (S_v) {
case 32: si = 0; break;
case 64: si = 1; break;
case 128: si = 2; break;
case 16: si = 0; break;
case 32: si = 1; break;
case 64: si = 2; break;
case 128: si = 3; break;
default: return nullptr;
}
return ctx->device->pipeline_gated_delta_net[si][kda];
@@ -17193,7 +17215,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_GATED_DELTA_NET:
{
const uint32_t S_v = op->src[2]->ne[0];
if (S_v != 32 && S_v != 64 && S_v != 128) {
if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) {
return false;
}
for (int i = 0; i < 6; i++) {
+58 -45
View File
@@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
@@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm(
res = ggml_add(ctx0, res, ab_cur);
}
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
return res;
}
ggml_tensor * llm_graph_context::build_lora_mm_id(
ggml_tensor * w, // ggml_tensor * as
ggml_tensor * cur, // ggml_tensor * b
ggml_tensor * ids) const {
ggml_tensor * ids,
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
if (w_s) {
const int64_t n_expert = w_s->ne[0];
const int64_t n_tokens = cur->ne[2];
ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, ids);
res = ggml_mul(ctx0, res, s);
}
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
@@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn(
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
int il) const {
// NVFP4 support is currently restricted to
// 1) LORA absence (*_s would be applied after LORA residual, which is incorrect)
// 2) bias absence (*_s would be applied after bias addition, which is incorrect)
// TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently)
auto has_lora = [this](ggml_tensor * w) {
if (!w) {
return false;
}
for (const auto & lora : *loras) {
if (lora.first->get_weight(w) != nullptr) {
return true;
}
}
return false;
};
GGML_ASSERT(!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora(up));
GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate));
GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down));
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
cb(tmp, "ffn_up", il);
@@ -1627,23 +1660,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
if (gate_up_exps) {
// merged gate_up path: one mul_mat_id, then split into gate and up views
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens]
cb(gate_up, "ffn_moe_gate_up", il);
if (up_exps_s) {
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
if (gate_up_exps_b) {
gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
cb(gate_up, "ffn_moe_gate_up_biased", il);
}
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
gate_up = ggml_mul(ctx0, gate_up, s);
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
@@ -1651,43 +1679,33 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(up, "ffn_moe_up", il);
} else {
// separate gate and up path
up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
if (up_exps_s) {
cb(up, "ffn_moe_up_scaled", il);
}
if (up_exps_b) {
up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
cb(up, "ffn_moe_up_biased", il);
}
// apply per-expert scale2 to up
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
up = ggml_mul(ctx0, up, s);
cb(up, "ffn_moe_up_scaled", il);
}
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
}
if (gate_exps_s) {
cb(cur, "ffn_moe_gate_scaled", il);
}
if (gate_exps_b) {
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
// apply per-expert scale2 to gate
if (gate_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
cur = ggml_mul(ctx0, cur, s);
cb(cur, "ffn_moe_gate_scaled", il);
}
}
const bool has_gate = gate_exps || gate_up_exps;
@@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
if (down_exps_s) {
cb(experts, "ffn_moe_down_scaled", il);
}
if (down_exps_b) {
experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
cb(experts, "ffn_moe_down_biased", il);
}
// apply per-expert scale2 to down
if (down_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
experts = ggml_mul(ctx0, experts, s);
cb(experts, "ffn_moe_down_scaled", il);
}
if (!weight_before_ffn) {
experts = ggml_mul(ctx0, experts, weights);
cb(experts, "ffn_moe_weighted", il);
+3 -2
View File
@@ -853,11 +853,12 @@ struct llm_graph_context {
ggml_tensor * cur,
ggml_tensor * w_s = nullptr) const;
// do mat_mul_id, while optionally apply lora
// do mat_mul_id, while optionally apply lora and per-expert scale
ggml_tensor * build_lora_mm_id(
ggml_tensor * w, // ggml_tensor * as
ggml_tensor * cur, // ggml_tensor * b
ggml_tensor * ids) const;
ggml_tensor * ids,
ggml_tensor * w_s = nullptr) const;
ggml_tensor * build_norm(
ggml_tensor * cur,
+27 -3
View File
@@ -201,6 +201,8 @@ struct server_slot {
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position
void reset() {
SLT_DBG(*this, "%s", "\n");
@@ -227,6 +229,8 @@ struct server_slot {
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
n_draft_verif_steps = 0;
n_accepted_per_pos.clear();
task_prev = std::move(task);
task.reset();
@@ -509,10 +513,22 @@ struct server_slot {
llama_perf_context(ctx_tgt).n_reused);
if (n_draft_total > 0) {
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
std::string acceptance_rates_per_pos;
if (n_draft_verif_steps > 0) {
for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
if (i > 0) {
acceptance_rates_per_pos += ", ";
}
acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
}
}
SLT_INF(*this,
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
draft_ratio, n_draft_accepted, n_draft_total);
"draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
}
common_speculative_print_stats(spec);
@@ -3543,6 +3559,14 @@ private:
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
slot.n_draft_verif_steps += 1;
if (slot.n_accepted_per_pos.empty()) {
slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
}
for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
slot.n_accepted_per_pos[i]++;
}
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
@@ -41,6 +41,7 @@
DATA_ERROR_HANDLED_ATTR,
BOOL_TRUE_STRING,
SETTINGS_KEYS,
CODE_BLOCK_HEADER_CLASS,
MERMAID_WRAPPER_CLASS,
MERMAID_BLOCK_CLASS,
MERMAID_LANGUAGE,
@@ -53,7 +54,11 @@
SVG_TAG_PREFIX,
SVG_SOURCE_ATTR,
SVG_RENDERED_ATTR,
SVG_INLINE_SHADOW_STYLE
SVG_INLINE_SHADOW_STYLE,
TOGGLE_SOURCE_BTN_CLASS,
DIAGRAM_VIEW_MODE_ATTR,
DIAGRAM_VIEW_RENDERED,
DIAGRAM_VIEW_SOURCE
} from '$lib/constants';
import { ColorMode, UrlProtocol } from '$lib/enums';
import { FileTypeText } from '$lib/enums/files.enums';
@@ -501,6 +506,23 @@
async function handleMermaidClick(event: MouseEvent) {
const target = event.target as HTMLElement;
// Toggle a diagram block between its rendered view and its source view.
// Shared by mermaid and svg, css drives the visibility from the wrapper mode.
const toggleBtn = target.closest(`.${TOGGLE_SOURCE_BTN_CLASS}`);
if (toggleBtn) {
event.preventDefault();
event.stopPropagation();
const wrapper = toggleBtn.closest(`.${MERMAID_WRAPPER_CLASS}, .${SVG_WRAPPER_CLASS}`);
if (!wrapper) return;
const isSource = wrapper.getAttribute(DIAGRAM_VIEW_MODE_ATTR) === DIAGRAM_VIEW_SOURCE;
const next = isSource ? DIAGRAM_VIEW_RENDERED : DIAGRAM_VIEW_SOURCE;
wrapper.setAttribute(DIAGRAM_VIEW_MODE_ATTR, next);
toggleBtn.setAttribute('aria-pressed', String(!isSource));
return;
}
// Check if clicking on copy or preview button in mermaid block
const copyBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .copy-code-btn`);
const previewBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .preview-code-btn`);
@@ -573,6 +595,11 @@
}
}
// A click on the header chrome targets the action buttons, never the
// diagram. Guard so a header click can not fall through to the click to
// zoom branches below, whatever the scroll position or stacking.
if (target.closest(`.${CODE_BLOCK_HEADER_CLASS}`)) return;
// Open preview when clicking the svg block itself. A final block carries its
// source, a streaming block does not and is mirrored live into the dialog.
const svgEl = target.closest(`.${SVG_BLOCK_CLASS}`);
@@ -300,7 +300,8 @@ div.markdown-user-content :global(.table-wrapper) {
}
.markdown-content :global(.copy-code-btn),
.markdown-content :global(.preview-code-btn) {
.markdown-content :global(.preview-code-btn),
.markdown-content :global(.toggle-source-btn) {
display: flex;
align-items: center;
justify-content: center;
@@ -312,15 +313,22 @@ div.markdown-user-content :global(.table-wrapper) {
}
.markdown-content :global(.copy-code-btn:hover),
.markdown-content :global(.preview-code-btn:hover) {
.markdown-content :global(.preview-code-btn:hover),
.markdown-content :global(.toggle-source-btn:hover) {
transform: scale(1.05);
}
.markdown-content :global(.copy-code-btn:active),
.markdown-content :global(.preview-code-btn:active) {
.markdown-content :global(.preview-code-btn:active),
.markdown-content :global(.toggle-source-btn:active) {
transform: scale(0.95);
}
/* Pressed state marks the source view as active */
.markdown-content :global(.toggle-source-btn[aria-pressed='true']) {
color: var(--primary);
}
.markdown-content :global(.code-block-wrapper pre) {
background: transparent;
margin: 0;
@@ -629,8 +637,8 @@ div.markdown-user-content :global(.table-wrapper) {
overflow-y: auto;
overflow-x: auto;
display: flex;
align-items: center;
justify-content: center;
align-items: safe center;
justify-content: safe center;
padding: 3rem 1rem 1rem;
}
@@ -645,7 +653,9 @@ div.markdown-user-content :global(.table-wrapper) {
overflow-y: visible;
}
/* Diagram block uses same header styling as code blocks */
/* Diagram block uses same header styling as code blocks. The header floats over
scrollable diagram content and stays transparent, so the overflow shows up to
the box edge. It keeps a z-index so it stays the click target above content. */
.markdown-content :global(.mermaid-block-wrapper .code-block-header),
.markdown-content :global(.svg-block-wrapper .code-block-header) {
display: flex;
@@ -657,6 +667,7 @@ div.markdown-user-content :global(.table-wrapper) {
top: 0;
left: 0;
right: 0;
z-index: 2;
}
.markdown-content :global(.mermaid-block-wrapper .code-block-actions),
@@ -683,6 +694,31 @@ div.markdown-user-content :global(.table-wrapper) {
padding: 3rem 1rem;
}
/* Source view stays hidden while the block renders, css swaps the two views
from the wrapper mode so the click handler only flips one attribute. The view
reuses the code block scroll container, so it matches the app code blocks. */
.markdown-content :global(.diagram-source) {
display: none;
text-align: left;
}
.markdown-content :global(.diagram-source pre) {
background: transparent;
margin: 0;
border-radius: 0;
border: none;
font-size: 0.875rem;
}
.markdown-content :global([data-view-mode='source'] .mermaid-scroll-container),
.markdown-content :global([data-view-mode='source'] .svg-scroll-container) {
display: none;
}
.markdown-content :global([data-view-mode='source'] .diagram-source) {
display: block;
}
/* Streaming mermaid block - empty preview box */
.mermaid-streaming-block {
min-height: 300px;
@@ -7,12 +7,16 @@ import type { Element, ElementContent } from 'hast';
import {
CODE_BLOCK_HEADER_CLASS,
CODE_BLOCK_ACTIONS_CLASS,
CODE_BLOCK_SCROLL_CONTAINER_CLASS,
CODE_LANGUAGE_CLASS,
COPY_CODE_BTN_CLASS,
PREVIEW_CODE_BTN_CLASS,
TOGGLE_SOURCE_BTN_CLASS,
DIAGRAM_SOURCE_CLASS,
RELATIVE_CLASS,
COPY_ICON_SVG,
PREVIEW_ICON_SVG
PREVIEW_ICON_SVG,
CODE_ICON_SVG
} from '$lib/constants';
export interface BlockIdGenerator {
@@ -32,14 +36,16 @@ export function createIconElement(svg: string): Element {
}
/**
* Creates a button element with icon.
* Creates a button element with icon. Extra properties merge onto the button,
* which lets a stateful button carry attributes like aria-pressed.
*/
export function createButton(
className: string,
title: string,
iconSvg: string,
id: string,
idAttribute: string
idAttribute: string,
extraProperties: Record<string, string> = {}
): Element {
return {
type: 'element',
@@ -48,7 +54,8 @@ export function createButton(
className: [className],
[idAttribute]: id,
title,
type: 'button'
type: 'button',
...extraProperties
},
children: [createIconElement(iconSvg)]
};
@@ -72,6 +79,52 @@ export function createPreviewButton(
return createButton(PREVIEW_CODE_BTN_CLASS, title, PREVIEW_ICON_SVG, id, idAttribute);
}
/**
 * Builds the action button that flips a diagram block between its rendered
 * output and its source code. The rendered view is the initial state, so the
 * button is emitted with aria-pressed set to 'false'.
 *
 * @param id - Block identifier written onto the button.
 * @param idAttribute - Name of the attribute that carries the identifier.
 * @param title - Tooltip text, defaults to 'Toggle source'.
 * @returns The button element produced by createButton.
 */
export function createToggleSourceButton(
	id: string,
	idAttribute: string,
	title: string = 'Toggle source'
): Element {
	// Initial pressed state; merged onto the button as an extra property.
	const initialState: Record<string, string> = { 'aria-pressed': 'false' };

	return createButton(
		TOGGLE_SOURCE_BTN_CLASS,
		title,
		CODE_ICON_SVG,
		id,
		idAttribute,
		initialState
	);
}
/**
 * Builds the source view of a diagram block: a div carrying both the diagram
 * source class and the code block scroll container class, holding a single
 * pre that wraps the code element.
 *
 * @param codeElement - Highlighted code element captured at transform time,
 *   reused as is when present.
 * @param source - Raw source text, used only when codeElement is missing.
 * @param language - Language name, used for the fallback `language-*` class.
 * @returns The source view element.
 */
export function createSourceView(
	codeElement: Element | undefined,
	source: string,
	language: string
): Element {
	// Plain, unhighlighted code node; only built when no captured element exists.
	const buildFallbackCode = (): Element => ({
		type: 'element',
		tagName: 'code',
		properties: { className: ['hljs', `language-${language}`] },
		children: [{ type: 'text', value: source }]
	});

	const codeNode: Element = codeElement ?? buildFallbackCode();

	const preNode: Element = {
		type: 'element',
		tagName: 'pre',
		properties: {},
		children: [codeNode]
	};

	return {
		type: 'element',
		tagName: 'div',
		properties: { className: [DIAGRAM_SOURCE_CLASS, CODE_BLOCK_SCROLL_CONTAINER_CLASS] },
		children: [preNode]
	};
}
/**
* Creates a block header with language label and action buttons.
*/
@@ -116,14 +169,17 @@ export function createScrollContainer(preElement: Element, scrollContainerClass:
}
/**
* Creates a wrapper element with header and scroll container.
* Creates a wrapper element with header and scroll container. Extra children
* append after the scroll container, which lets a block carry a source view
* alongside its rendered output.
*/
export function createWrapper(
header: Element,
preElement: Element,
wrapperClass: string,
scrollContainerClass: string,
additionalAttributes?: Record<string, string>
additionalAttributes?: Record<string, string>,
extraChildren: Element[] = []
): Element {
return {
type: 'element',
@@ -132,7 +188,7 @@ export function createWrapper(
className: [wrapperClass, RELATIVE_CLASS],
...additionalAttributes
} as Element['properties'],
children: [header, createScrollContainer(preElement, scrollContainerClass)]
children: [header, createScrollContainer(preElement, scrollContainerClass), ...extraChildren]
};
}
@@ -19,12 +19,17 @@ import {
MERMAID_BLOCK_CLASS,
MERMAID_LANGUAGE,
MERMAID_SYNTAX_ATTR,
MERMAID_ID_ATTR
MERMAID_ID_ATTR,
DIAGRAM_VIEW_MODE_ATTR,
DIAGRAM_VIEW_RENDERED
} from '$lib/constants';
import type { DiagramPreData } from './pre-transform';
import {
createBlockHeader,
createCopyButton,
createPreviewButton,
createToggleSourceButton,
createSourceView,
createWrapper,
generateBlockId
} from './code-block-utils';
@@ -75,16 +80,23 @@ export const rehypeEnhanceMermaidBlocks: Plugin<[], Root> = () => {
const actions = [
createCopyButton(mermaidId, MERMAID_ID_ATTR, 'Copy mermaid syntax'),
createToggleSourceButton(mermaidId, MERMAID_ID_ATTR, 'Toggle mermaid source'),
createPreviewButton(mermaidId, MERMAID_ID_ATTR, 'Preview diagram')
];
const header = createBlockHeader(MERMAID_LANGUAGE, mermaidId, MERMAID_ID_ATTR, actions);
const preservedCode = (node.data as DiagramPreData | undefined)?.sourceCode;
const sourceView = createSourceView(preservedCode, diagramText, MERMAID_LANGUAGE);
const wrapper = createWrapper(
header,
node,
MERMAID_WRAPPER_CLASS,
MERMAID_SCROLL_CONTAINER_CLASS,
{ [MERMAID_ID_ATTR]: mermaidId }
{
[MERMAID_ID_ATTR]: mermaidId,
[DIAGRAM_VIEW_MODE_ATTR]: DIAGRAM_VIEW_RENDERED
},
[sourceView]
);
// Replace pre with wrapper in parent
@@ -18,12 +18,17 @@ import {
SVG_BLOCK_CLASS,
SVG_LANGUAGE,
SVG_SOURCE_ATTR,
SVG_ID_ATTR
SVG_ID_ATTR,
DIAGRAM_VIEW_MODE_ATTR,
DIAGRAM_VIEW_RENDERED
} from '$lib/constants';
import type { DiagramPreData } from './pre-transform';
import {
createBlockHeader,
createCopyButton,
createPreviewButton,
createToggleSourceButton,
createSourceView,
createWrapper,
generateBlockId
} from './code-block-utils';
@@ -65,13 +70,24 @@ export const rehypeEnhanceSvgBlocks: Plugin<[], Root> = () => {
const actions = [
createCopyButton(svgId, SVG_ID_ATTR, 'Copy svg source'),
createToggleSourceButton(svgId, SVG_ID_ATTR, 'Toggle svg source'),
createPreviewButton(svgId, SVG_ID_ATTR, 'Preview svg')
];
const header = createBlockHeader(SVG_LANGUAGE, svgId, SVG_ID_ATTR, actions);
const wrapper = createWrapper(header, node, SVG_WRAPPER_CLASS, SVG_SCROLL_CONTAINER_CLASS, {
[SVG_ID_ATTR]: svgId
});
const preservedCode = (node.data as DiagramPreData | undefined)?.sourceCode;
const sourceView = createSourceView(preservedCode, svgSource, SVG_LANGUAGE);
const wrapper = createWrapper(
header,
node,
SVG_WRAPPER_CLASS,
SVG_SCROLL_CONTAINER_CLASS,
{
[SVG_ID_ATTR]: svgId,
[DIAGRAM_VIEW_MODE_ATTR]: DIAGRAM_VIEW_RENDERED
},
[sourceView]
);
// Replace pre with wrapper in parent
(parent.children as ElementContent[])[index] = wrapper;
@@ -2,6 +2,15 @@ import type { Plugin } from 'unified';
import type { Root, Element, ElementContent, Text } from 'hast';
import { visit } from 'unist-util-visit';
/**
 * Metadata a diagram pre carries on its unist data field. The source code holds
 * the highlighted code element captured before the pre became a render target,
 * which the enhancer reuses to build a matching source view.
 */
export interface DiagramPreData {
// Original highlighted <code> element of the block, stored by the pre transform.
sourceCode: Element;
}
/**
* Recursively extracts all text content from a HAST node.
* Handles nested elements (e.g., span wrappers from syntax highlighting).
@@ -69,7 +78,10 @@ export function createPreTransform(
properties: {
className: [targetClass]
},
children: [{ type: 'text', value: text } as Text]
children: [{ type: 'text', value: text } as Text],
// Keep the highlighted code element so the block can offer a source
// view that matches the app code blocks without re-highlighting.
data: { sourceCode: codeElement } satisfies DiagramPreData
};
(parent.children as ElementContent[])[index] = pre;
@@ -0,0 +1,9 @@
// Shared constants for diagram blocks (mermaid and svg) that toggle between a
// rendered view and a source view. The wrapper carries the active mode, css
// drives the visibility, the click handler only flips the attribute.

// Attribute on the block wrapper that stores the active view mode.
export const DIAGRAM_VIEW_MODE_ATTR = 'data-view-mode';
// Mode value for the rendered diagram; wrappers are created in this mode.
export const DIAGRAM_VIEW_RENDERED = 'rendered';
// Mode value for the source view; css shows the source and hides the diagram.
export const DIAGRAM_VIEW_SOURCE = 'source';
// Class of the source view container built for each diagram block.
export const DIAGRAM_SOURCE_CLASS = 'diagram-source';
// Class of the header button that switches between the two views.
export const TOGGLE_SOURCE_BTN_CLASS = 'toggle-source-btn';
+2
View File
@@ -39,3 +39,5 @@ export const MODALITY_LABELS = {
export const COPY_ICON_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-copy-icon lucide-copy"><rect width="14" height="14" x="8" y="8" rx="2" ry="2"/><path d="M4 16c-1.1 0-2-.9-2-2V4c0-1.1.9-2 2-2h10c1.1 0 2 .9 2 2"/></svg>`;
export const PREVIEW_ICON_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-eye lucide-eye-icon"><path d="M2.062 12.345a1 1 0 0 1 0-.69C3.5 7.73 7.36 5 12 5s8.5 2.73 9.938 6.655a1 1 0 0 1 0 .69C20.5 16.27 16.64 19 12 19s-8.5-2.73-9.938-6.655"/><circle cx="12" cy="12" r="3"/></svg>`;
export const CODE_ICON_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-code lucide-code-icon"><path d="m16 18 6-6-6-6"/><path d="m8 6-6 6 6 6"/></svg>`;
+1
View File
@@ -30,6 +30,7 @@ export * from './literal-html';
export * from './markdown';
export * from './mermaid-blocks';
export * from './svg-blocks';
export * from './diagram-blocks';
export * from './max-bundle-size';
export * from './mcp';
export * from './mcp-form';
+1 -1
View File
@@ -41,7 +41,7 @@ if (LLAMA_BUILD_BORINGSSL)
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
set(BORINGSSL_VERSION "0.20260526.0" CACHE STRING "BoringSSL version")
set(BORINGSSL_VERSION "0.20260616.0" CACHE STRING "BoringSSL version")
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")