vendor: update BoringSSL to 0.20260616.0 (#24693)

ui: add source toggle to mermaid and svg blocks (#24652)
* ui: add source toggle to mermaid and svg blocks Add a toggle button next to copy and preview that switches a rendered mermaid or svg block to its source code and back. The button is shared by both block types and the rendered view stays the default. The source view reuses the code block scroll container and the highlighted code element captured at transform time, so it matches the app code blocks without highlighting again. Make tall diagrams scroll like text code blocks: safe centering keeps the diagram centered when it fits and falls back to start alignment when it overflows, so the top stays reachable instead of clipping above. Keep the block header opaque and layered above the scrolled diagram, and ignore header clicks in the zoom handler, so a button click never falls through to the zoom dialog. * ui: transparent diagram block header, address review from @allozaur
2026-06-17 02:57:39 +02:00 · 2026-06-16 20:24:28 +02:00 · 2026-06-16 14:14:22 +02:00 · 2026-06-16 11:52:38 +02:00 · 2026-06-16 12:05:52 +03:00 · 2026-06-16 09:36:52 +02:00
15 changed files with 379 additions and 88 deletions
@@ -140,6 +140,8 @@ struct common_speculative_impl {
    size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
    size_t n_acc_tokens = 0; // number of tokens accepted by the target model.

+    std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
+
    // TODO: track performance of most recent calls
    const bool gen_perf = true; // whether to generate performance stats.

@@ -416,6 +418,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {

    std::vector<common_sampler_ptr> smpls;

+    // backend sampler chain per seq, attached to ctx_dft
+    std::vector<llama_sampler *> backend_chains;
+
    int32_t n_embd_dec = 0;       // draft hidden size
    int32_t n_embd_enc = 0;       // target_layer_ids_n * target_hidden_size
    int32_t n_embd_tgt = 0;       // target model hidden size
@@ -441,7 +446,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
        , params(params.draft)
    {
        LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling);

        auto * ctx_tgt = this->params.ctx_tgt;
        auto * ctx_dft = this->params.ctx_dft;
@@ -476,6 +481,22 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
            s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
        }

+        // offload draft sampling to the backend
+        backend_chains.assign(n_seq, nullptr);
+        if (this->params.backend_sampling) {
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
+
+                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
+                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    llama_sampler_free(chain);
+                    chain = nullptr;
+                }
+                backend_chains[seq_id] = chain;
+            }
+        }
+
        // turn on extraction of the target layers' input embeddings
        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
@@ -494,6 +515,18 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
    }

    ~common_speculative_impl_draft_eagle3() override {
+        auto * ctx_dft = this->params.ctx_dft;
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
+            if (backend_chains[seq_id] == nullptr) {
+                continue;
+            }
+            if (ctx_dft) {
+                llama_set_sampler(ctx_dft, seq_id, nullptr);
+            }
+            llama_sampler_free(backend_chains[seq_id]);
+        }
+        backend_chains.clear();
+
        if (batch.token != nullptr) {
            free(batch.token);
            batch.token = nullptr;
@@ -2059,6 +2092,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u

    {
        common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
+
+        if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
+            impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
+        }
+
+        for (size_t i = 0; i < n_accepted; ++i) {
+            impl->n_acc_tokens_per_pos[i]++;
+        }
+
        if (n_accepted > 0) {
            impl->n_acc_drafts++;
            impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2135,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
            str_perf = "";
        }

-        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
+        std::string str_stats;
+        if (impl->n_call_accept > 0) {
+            const double mean =
+                1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
+            std::ostringstream tmp;
+            tmp << std::fixed << std::setprecision(3);
+            for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
+                if (i > 0) {
+                    tmp << ", ";
+                }
+                tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
+            }
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(2) << mean;
+            str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
+        }
+
+        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
                common_speculative_type_to_str(impl->type).c_str(),
                impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
                impl->n_gen_drafts,
                impl->n_acc_drafts,
                impl->n_gen_tokens,
                impl->n_acc_tokens,
+                str_stats.c_str(),
                str_perf.c_str());
    }
 }
@@ -911,8 +911,8 @@ struct vk_device_struct {
    vk_pipeline pipeline_pool2d_f32;
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
-    // [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
-    vk_pipeline pipeline_gated_delta_net[3][2];
+    // [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128
+    vk_pipeline pipeline_gated_delta_net[4][2];
    vk_pipeline pipeline_ssm_scan_f32_d128;
    vk_pipeline pipeline_ssm_scan_f32_d256;
    vk_pipeline pipeline_ssm_conv_f32;
@@ -3080,8 +3080,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
                                                       vk::MemoryPropertyFlagBits::eDeviceLocal});
        } else if (device->uma) {
-            // Fall back to host memory type
-            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
+            // On UMA, prefer host-visible memory so direct tensor borrowing works.
+            // If unavailable, fall back to device-local memory.
+            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
+                                                       vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                       vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
        } else if (device->disable_host_visible_vidmem) {
            if (device->allow_sysmem_fallback) {
@@ -5231,14 +5233,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);

    {
-        const uint32_t gdn_sizes[] = {32, 64, 128};
+        const uint32_t gdn_sizes[] = {16, 32, 64, 128};
        const char * gdn_names[][2] = {
+            {"gated_delta_net_f32_d16",     "gated_delta_net_f32_d16_kda"},
            {"gated_delta_net_f32_d32",     "gated_delta_net_f32_d32_kda"},
            {"gated_delta_net_f32_d64",     "gated_delta_net_f32_d64_kda"},
            {"gated_delta_net_f32_d128",    "gated_delta_net_f32_d128_kda"},
        };
-        const bool use_subgroup_reduce = device->subgroup_arithmetic;
-        for (uint32_t si = 0; si < 3; si++) {
+        for (uint32_t si = 0; si < 4; si++) {
            const uint32_t S_V = gdn_sizes[si];
            GGML_ASSERT(is_pow2(S_V));

@@ -5252,10 +5254,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
                lanes_per_column = std::min(S_V, device->subgroup_size);
            }

-            const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
+            // gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and
+            // S_V % LANES_PER_COLUMN == 0 to avoid bounds checks.
+            while (lanes_per_column > 1u) {
+                const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 &&
+                                         (S_V % lanes_per_column) == 0;
+                const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0;
+                if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) {
+                    break;
+                }
+                lanes_per_column >>= 1u;
+            }
+
+            GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0);
+            GGML_ASSERT((S_V % lanes_per_column) == 0);
+            GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0);
+
+            const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size;
+            const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce;
+            const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce;
+            const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce;
            size_t gdn_len;
            const void * gdn_data;
-            if (use_subgroup_reduce && need_clustered_shader) {
+            if (use_clustered_reduce) {
                gdn_len = gated_delta_net_f32_len;
                gdn_data = (const void *)gated_delta_net_f32_data;
            } else if (use_subgroup_reduce) {
@@ -5272,7 +5293,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
            for (uint32_t kda = 0; kda < 2; kda++) {
                ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda],
                    gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
-                    wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
+                    wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size);
            }
        }
    }
@@ -10746,9 +10767,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
            uint32_t si;
            switch (S_v) {
-                case 32:  si = 0; break;
-                case 64:  si = 1; break;
-                case 128: si = 2; break;
+                case 16:  si = 0; break;
+                case 32:  si = 1; break;
+                case 64:  si = 2; break;
+                case 128: si = 3; break;
                default: return nullptr;
            }
            return ctx->device->pipeline_gated_delta_net[si][kda];
@@ -17193,7 +17215,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_GATED_DELTA_NET:
            {
                const uint32_t S_v = op->src[2]->ne[0];
-                if (S_v != 32 && S_v != 64 && S_v != 128) {
+                if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) {
                    return false;
                }
                for (int i = 0; i < 6; i++) {
@@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
          ggml_tensor * w_s) const {
    ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);

+    if (w_s) {
+        res = ggml_mul(ctx0, res, w_s);
+    }
+
    for (const auto & lora : *loras) {
        llama_adapter_lora_weight * lw = lora.first->get_weight(w);
        if (lw == nullptr) {
@@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm(
        res = ggml_add(ctx0, res, ab_cur);
    }

-    if (w_s) {
-        res = ggml_mul(ctx0, res, w_s);
-    }
-
    return res;
 }

 ggml_tensor * llm_graph_context::build_lora_mm_id(
          ggml_tensor * w,   // ggml_tensor * as
          ggml_tensor * cur, // ggml_tensor * b
-          ggml_tensor * ids) const {
+          ggml_tensor * ids,
+          ggml_tensor * w_s) const {
    ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
+
+    if (w_s) {
+        const int64_t n_expert = w_s->ne[0];
+        const int64_t n_tokens = cur->ne[2];
+        ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1);
+        s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+        s = ggml_get_rows(ctx0, s, ids);
+        res = ggml_mul(ctx0, res, s);
+    }
    for (const auto & lora : *loras) {
        llama_adapter_lora_weight * lw = lora.first->get_weight(w);
        if (lw == nullptr) {
@@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn(
     llm_ffn_op_type   type_op,
   llm_ffn_gate_type   type_gate,
                 int   il) const {
+    // NVFP4 support is currently restricted to
+    // 1) LORA absence (*_s would be applied after LORA residual, which is incorrect)
+    // 2) bias absence (*_s would be applied after bias addition, which is incorrect)
+    // TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently)
+    auto has_lora = [this](ggml_tensor * w) {
+        if (!w) {
+            return false;
+        }
+        for (const auto & lora : *loras) {
+            if (lora.first->get_weight(w) != nullptr) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    GGML_ASSERT(!up_s   || !up_b   || !up   || up->type   != GGML_TYPE_NVFP4);
+    GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4);
+    GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4);
+    GGML_ASSERT(!up_s   || !up   || up->type   != GGML_TYPE_NVFP4 || !has_lora(up));
+    GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate));
+    GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down));
+
    ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
    cb(tmp, "ffn_up", il);

@@ -1627,23 +1660,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

    if (gate_up_exps) {
        // merged gate_up path: one mul_mat_id, then split into gate and up views
-        ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
+        ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens]
        cb(gate_up, "ffn_moe_gate_up", il);

+        if (up_exps_s) {
+            cb(gate_up, "ffn_moe_gate_up_scaled", il);
+        }
+
        if (gate_up_exps_b) {
            gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
            cb(gate_up, "ffn_moe_gate_up_biased", il);
        }

-        // apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
-        if (up_exps_s) {
-            ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
-            s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
-            s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
-            gate_up = ggml_mul(ctx0, gate_up, s);
-            cb(gate_up, "ffn_moe_gate_up_scaled", il);
-        }
-
        const int64_t n_ff = gate_up->ne[0] / 2;
        cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
        cb(cur, "ffn_moe_gate", il);
@@ -1651,43 +1679,33 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        cb(up, "ffn_moe_up", il);
    } else {
        // separate gate and up path
-        up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens]
        cb(up, "ffn_moe_up", il);

+        if (up_exps_s) {
+            cb(up, "ffn_moe_up_scaled", il);
+        }
+
        if (up_exps_b) {
            up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
            cb(up, "ffn_moe_up_biased", il);
        }

-        // apply per-expert scale2 to up
-        if (up_exps_s) {
-            ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
-            s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
-            s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
-            up = ggml_mul(ctx0, up, s);
-            cb(up, "ffn_moe_up_scaled", il);
-        }
-
        if (gate_exps) {
-            cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+            cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens]
            cb(cur, "ffn_moe_gate", il);
        } else {
            cur = up;
        }

+        if (gate_exps_s) {
+            cb(cur, "ffn_moe_gate_scaled", il);
+        }
+
        if (gate_exps_b) {
            cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
            cb(cur, "ffn_moe_gate_biased", il);
        }
-
-        // apply per-expert scale2 to gate
-        if (gate_exps_s) {
-            ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
-            s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
-            s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
-            cur = ggml_mul(ctx0, cur, s);
-            cb(cur, "ffn_moe_gate_scaled", il);
-        }
    }

    const bool has_gate = gate_exps || gate_up_exps;
@@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
            GGML_ABORT("fatal error");
    }

-    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens]
    cb(experts, "ffn_moe_down", il);

+    if (down_exps_s) {
+        cb(experts, "ffn_moe_down_scaled", il);
+    }
+
    if (down_exps_b) {
        experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
        cb(experts, "ffn_moe_down_biased", il);
    }

-    // apply per-expert scale2 to down
-    if (down_exps_s) {
-        ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
-        s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
-        s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
-        experts = ggml_mul(ctx0, experts, s);
-        cb(experts, "ffn_moe_down_scaled", il);
-    }
-
    if (!weight_before_ffn) {
        experts = ggml_mul(ctx0, experts, weights);
        cb(experts, "ffn_moe_weighted", il);
@@ -853,11 +853,12 @@ struct llm_graph_context {
              ggml_tensor * cur,
              ggml_tensor * w_s = nullptr) const;

-    // do mat_mul_id, while optionally apply lora
+    // do mat_mul_id, while optionally apply lora and per-expert scale
    ggml_tensor * build_lora_mm_id(
              ggml_tensor * w,   // ggml_tensor * as
              ggml_tensor * cur, // ggml_tensor * b
-              ggml_tensor * ids) const;
+              ggml_tensor * ids,
+              ggml_tensor * w_s = nullptr) const;

    ggml_tensor * build_norm(
             ggml_tensor * cur,
@@ -201,6 +201,8 @@ struct server_slot {
    // Speculative decoding stats
    int32_t n_draft_total = 0;      // Total draft tokens generated
    int32_t n_draft_accepted = 0;   // Draft tokens actually accepted
+    int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
+    std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position

    void reset() {
        SLT_DBG(*this, "%s", "\n");
@@ -227,6 +229,8 @@ struct server_slot {
        // clear speculative decoding stats
        n_draft_total = 0;
        n_draft_accepted = 0;
+        n_draft_verif_steps = 0;
+        n_accepted_per_pos.clear();

        task_prev = std::move(task);
        task.reset();
@@ -509,10 +513,22 @@ struct server_slot {
                llama_perf_context(ctx_tgt).n_reused);

        if (n_draft_total > 0) {
-            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            const float  draft_ratio  = (float) n_draft_accepted / n_draft_total;
+            const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
+
+            std::string acceptance_rates_per_pos;
+            if (n_draft_verif_steps > 0) {
+                for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
+                    if (i > 0) {
+                        acceptance_rates_per_pos += ", ";
+                    }
+                    acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
+                }
+            }
+
            SLT_INF(*this,
-                    "draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
-                    draft_ratio, n_draft_accepted, n_draft_total);
+                    "draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
        }

        common_speculative_print_stats(spec);
@@ -3543,6 +3559,14 @@ private:

                // update how many tokens out of those tested were accepted
                slot.n_draft_accepted += ids.size() - 1;
+                slot.n_draft_verif_steps += 1;
+
+                if (slot.n_accepted_per_pos.empty()) {
+                    slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
+                }
+                for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
+                    slot.n_accepted_per_pos[i]++;
+                }

                // add accepted tokens to the prompt
                slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
@@ -41,6 +41,7 @@
 		DATA_ERROR_HANDLED_ATTR,
 		BOOL_TRUE_STRING,
 		SETTINGS_KEYS,
+		CODE_BLOCK_HEADER_CLASS,
 		MERMAID_WRAPPER_CLASS,
 		MERMAID_BLOCK_CLASS,
 		MERMAID_LANGUAGE,
@@ -53,7 +54,11 @@
 		SVG_TAG_PREFIX,
 		SVG_SOURCE_ATTR,
 		SVG_RENDERED_ATTR,
-		SVG_INLINE_SHADOW_STYLE
+		SVG_INLINE_SHADOW_STYLE,
+		TOGGLE_SOURCE_BTN_CLASS,
+		DIAGRAM_VIEW_MODE_ATTR,
+		DIAGRAM_VIEW_RENDERED,
+		DIAGRAM_VIEW_SOURCE
 	} from '$lib/constants';
 	import { ColorMode, UrlProtocol } from '$lib/enums';
 	import { FileTypeText } from '$lib/enums/files.enums';
@@ -501,6 +506,23 @@
 	async function handleMermaidClick(event: MouseEvent) {
 		const target = event.target as HTMLElement;

+		// Toggle a diagram block between its rendered view and its source view.
+		// Shared by mermaid and svg, css drives the visibility from the wrapper mode.
+		const toggleBtn = target.closest(`.${TOGGLE_SOURCE_BTN_CLASS}`);
+		if (toggleBtn) {
+			event.preventDefault();
+			event.stopPropagation();
+
+			const wrapper = toggleBtn.closest(`.${MERMAID_WRAPPER_CLASS}, .${SVG_WRAPPER_CLASS}`);
+			if (!wrapper) return;
+
+			const isSource = wrapper.getAttribute(DIAGRAM_VIEW_MODE_ATTR) === DIAGRAM_VIEW_SOURCE;
+			const next = isSource ? DIAGRAM_VIEW_RENDERED : DIAGRAM_VIEW_SOURCE;
+			wrapper.setAttribute(DIAGRAM_VIEW_MODE_ATTR, next);
+			toggleBtn.setAttribute('aria-pressed', String(!isSource));
+			return;
+		}
+
 		// Check if clicking on copy or preview button in mermaid block
 		const copyBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .copy-code-btn`);
 		const previewBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .preview-code-btn`);
@@ -573,6 +595,11 @@
 			}
 		}

+		// A click on the header chrome targets the action buttons, never the
+		// diagram. Guard so a header click cannot fall through to the click-to-zoom
+		// branches below, whatever the scroll position or stacking.
+		if (target.closest(`.${CODE_BLOCK_HEADER_CLASS}`)) return;
+
 		// Open preview when clicking the svg block itself. A final block carries its
 		// source, a streaming block does not and is mirrored live into the dialog.
 		const svgEl = target.closest(`.${SVG_BLOCK_CLASS}`);
@@ -300,7 +300,8 @@ div.markdown-user-content :global(.table-wrapper) {
 }

 .markdown-content :global(.copy-code-btn),
-.markdown-content :global(.preview-code-btn) {
+.markdown-content :global(.preview-code-btn),
+.markdown-content :global(.toggle-source-btn) {
 	display: flex;
 	align-items: center;
 	justify-content: center;
@@ -312,15 +313,22 @@ div.markdown-user-content :global(.table-wrapper) {
 }

 .markdown-content :global(.copy-code-btn:hover),
-.markdown-content :global(.preview-code-btn:hover) {
+.markdown-content :global(.preview-code-btn:hover),
+.markdown-content :global(.toggle-source-btn:hover) {
 	transform: scale(1.05);
 }

 .markdown-content :global(.copy-code-btn:active),
-.markdown-content :global(.preview-code-btn:active) {
+.markdown-content :global(.preview-code-btn:active),
+.markdown-content :global(.toggle-source-btn:active) {
 	transform: scale(0.95);
 }

+/* Pressed state marks the source view as active */
+.markdown-content :global(.toggle-source-btn[aria-pressed='true']) {
+	color: var(--primary);
+}
+
 .markdown-content :global(.code-block-wrapper pre) {
 	background: transparent;
 	margin: 0;
@@ -629,8 +637,8 @@ div.markdown-user-content :global(.table-wrapper) {
 	overflow-y: auto;
 	overflow-x: auto;
 	display: flex;
-	align-items: center;
-	justify-content: center;
+	align-items: safe center;
+	justify-content: safe center;
 	padding: 3rem 1rem 1rem;
 }

@@ -645,7 +653,9 @@ div.markdown-user-content :global(.table-wrapper) {
 	overflow-y: visible;
 }

-/* Diagram block uses same header styling as code blocks */
+/* Diagram block uses same header styling as code blocks. The header floats over
+   scrollable diagram content and stays transparent, so the overflow shows up to
+   the box edge. It keeps a z-index so it stays the click target above content. */
 .markdown-content :global(.mermaid-block-wrapper .code-block-header),
 .markdown-content :global(.svg-block-wrapper .code-block-header) {
 	display: flex;
@@ -657,6 +667,7 @@ div.markdown-user-content :global(.table-wrapper) {
 	top: 0;
 	left: 0;
 	right: 0;
+	z-index: 2;
 }

 .markdown-content :global(.mermaid-block-wrapper .code-block-actions),
@@ -683,6 +694,31 @@ div.markdown-user-content :global(.table-wrapper) {
 	padding: 3rem 1rem;
 }

+/* Source view stays hidden while the block renders, css swaps the two views
+   from the wrapper mode so the click handler only flips one attribute. The view
+   reuses the code block scroll container, so it matches the app code blocks. */
+.markdown-content :global(.diagram-source) {
+	display: none;
+	text-align: left;
+}
+
+.markdown-content :global(.diagram-source pre) {
+	background: transparent;
+	margin: 0;
+	border-radius: 0;
+	border: none;
+	font-size: 0.875rem;
+}
+
+.markdown-content :global([data-view-mode='source'] .mermaid-scroll-container),
+.markdown-content :global([data-view-mode='source'] .svg-scroll-container) {
+	display: none;
+}
+
+.markdown-content :global([data-view-mode='source'] .diagram-source) {
+	display: block;
+}
+
 /* Streaming mermaid block - empty preview box */
 .mermaid-streaming-block {
 	min-height: 300px;
@@ -7,12 +7,16 @@ import type { Element, ElementContent } from 'hast';
 import {
 	CODE_BLOCK_HEADER_CLASS,
 	CODE_BLOCK_ACTIONS_CLASS,
+	CODE_BLOCK_SCROLL_CONTAINER_CLASS,
 	CODE_LANGUAGE_CLASS,
 	COPY_CODE_BTN_CLASS,
 	PREVIEW_CODE_BTN_CLASS,
+	TOGGLE_SOURCE_BTN_CLASS,
+	DIAGRAM_SOURCE_CLASS,
 	RELATIVE_CLASS,
 	COPY_ICON_SVG,
-	PREVIEW_ICON_SVG
+	PREVIEW_ICON_SVG,
+	CODE_ICON_SVG
 } from '$lib/constants';

 export interface BlockIdGenerator {
@@ -32,14 +36,16 @@ export function createIconElement(svg: string): Element {
 }

 /**
- * Creates a button element with icon.
+ * Creates a button element with icon. Extra properties merge onto the button,
+ * which lets a stateful button carry attributes like aria-pressed.
 */
 export function createButton(
 	className: string,
 	title: string,
 	iconSvg: string,
 	id: string,
-	idAttribute: string
+	idAttribute: string,
+	extraProperties: Record<string, string> = {}
 ): Element {
 	return {
 		type: 'element',
@@ -48,7 +54,8 @@ export function createButton(
 			className: [className],
 			[idAttribute]: id,
 			title,
-			type: 'button'
+			type: 'button',
+			...extraProperties
 		},
 		children: [createIconElement(iconSvg)]
 	};
@@ -72,6 +79,52 @@ export function createPreviewButton(
 	return createButton(PREVIEW_CODE_BTN_CLASS, title, PREVIEW_ICON_SVG, id, idAttribute);
 }

+/**
+ * Creates a button that toggles a diagram block between its rendered view and
+ * its source view. aria-pressed starts false, the rendered view is the default.
+ *
+ * The button is built through createButton with the toggle source class and
+ * the code icon; the aria-pressed attribute is handed over as an extra
+ * property so it ends up on the button element itself.
+ *
+ * @param id - Block identifier, stored on the button under idAttribute.
+ * @param idAttribute - Name of the property that carries the block identifier.
+ * @param title - Title text for the button; a generic label is used when omitted.
+ * @returns The hast element describing the toggle button.
+ */
+export function createToggleSourceButton(
+	id: string,
+	idAttribute: string,
+	title: string = 'Toggle source'
+): Element {
+	return createButton(TOGGLE_SOURCE_BTN_CLASS, title, CODE_ICON_SVG, id, idAttribute, {
+		'aria-pressed': 'false'
+	});
+}
+
+/**
+ * Creates a source view for a diagram block. It reuses the code block scroll
+ * container so it matches the app code blocks, and wraps the highlighted code
+ * element captured at transform time. A missing code element falls back to a
+ * plain code node built from the raw source.
+ *
+ * The given code element is embedded by reference, it is not cloned. The
+ * fallback node carries the hljs class plus a language class derived from the
+ * language argument, and holds the source as a single text child.
+ *
+ * @param codeElement - Code element to embed as is; when absent the fallback
+ *   code node is built instead.
+ * @param source - Raw diagram source, only read when the fallback is built.
+ * @param language - Language name, only read when the fallback is built.
+ * @returns A div with the diagram source class and the code block scroll
+ *   container class, wrapping a pre element that holds the code element.
+ */
+export function createSourceView(
+	codeElement: Element | undefined,
+	source: string,
+	language: string
+): Element {
+	const code: Element = codeElement ?? {
+		type: 'element',
+		tagName: 'code',
+		properties: { className: ['hljs', `language-${language}`] },
+		children: [{ type: 'text', value: source }]
+	};
+	return {
+		type: 'element',
+		tagName: 'div',
+		properties: { className: [DIAGRAM_SOURCE_CLASS, CODE_BLOCK_SCROLL_CONTAINER_CLASS] },
+		children: [
+			{
+				type: 'element',
+				tagName: 'pre',
+				properties: {},
+				children: [code]
+			}
+		]
+	};
+}
+
 /**
 * Creates a block header with language label and action buttons.
 */
@@ -116,14 +169,17 @@ export function createScrollContainer(preElement: Element, scrollContainerClass:
 }

 /**
- * Creates a wrapper element with header and scroll container.
+ * Creates a wrapper element with header and scroll container. Extra children
+ * append after the scroll container, which lets a block carry a source view
+ * alongside its rendered output.
 */
 export function createWrapper(
 	header: Element,
 	preElement: Element,
 	wrapperClass: string,
 	scrollContainerClass: string,
-	additionalAttributes?: Record<string, string>
+	additionalAttributes?: Record<string, string>,
+	extraChildren: Element[] = []
 ): Element {
 	return {
 		type: 'element',
@@ -132,7 +188,7 @@ export function createWrapper(
 			className: [wrapperClass, RELATIVE_CLASS],
 			...additionalAttributes
 		} as Element['properties'],
-		children: [header, createScrollContainer(preElement, scrollContainerClass)]
+		children: [header, createScrollContainer(preElement, scrollContainerClass), ...extraChildren]
 	};
 }

@@ -19,12 +19,17 @@ import {
 	MERMAID_BLOCK_CLASS,
 	MERMAID_LANGUAGE,
 	MERMAID_SYNTAX_ATTR,
-	MERMAID_ID_ATTR
+	MERMAID_ID_ATTR,
+	DIAGRAM_VIEW_MODE_ATTR,
+	DIAGRAM_VIEW_RENDERED
 } from '$lib/constants';
+import type { DiagramPreData } from './pre-transform';
 import {
 	createBlockHeader,
 	createCopyButton,
 	createPreviewButton,
+	createToggleSourceButton,
+	createSourceView,
 	createWrapper,
 	generateBlockId
 } from './code-block-utils';
@@ -75,16 +80,23 @@ export const rehypeEnhanceMermaidBlocks: Plugin<[], Root> = () => {

 			const actions = [
 				createCopyButton(mermaidId, MERMAID_ID_ATTR, 'Copy mermaid syntax'),
+				createToggleSourceButton(mermaidId, MERMAID_ID_ATTR, 'Toggle mermaid source'),
 				createPreviewButton(mermaidId, MERMAID_ID_ATTR, 'Preview diagram')
 			];

 			const header = createBlockHeader(MERMAID_LANGUAGE, mermaidId, MERMAID_ID_ATTR, actions);
+			const preservedCode = (node.data as DiagramPreData | undefined)?.sourceCode;
+			const sourceView = createSourceView(preservedCode, diagramText, MERMAID_LANGUAGE);
 			const wrapper = createWrapper(
 				header,
 				node,
 				MERMAID_WRAPPER_CLASS,
 				MERMAID_SCROLL_CONTAINER_CLASS,
-				{ [MERMAID_ID_ATTR]: mermaidId }
+				{
+					[MERMAID_ID_ATTR]: mermaidId,
+					[DIAGRAM_VIEW_MODE_ATTR]: DIAGRAM_VIEW_RENDERED
+				},
+				[sourceView]
 			);

 			// Replace pre with wrapper in parent
@@ -18,12 +18,17 @@ import {
 	SVG_BLOCK_CLASS,
 	SVG_LANGUAGE,
 	SVG_SOURCE_ATTR,
-	SVG_ID_ATTR
+	SVG_ID_ATTR,
+	DIAGRAM_VIEW_MODE_ATTR,
+	DIAGRAM_VIEW_RENDERED
 } from '$lib/constants';
+import type { DiagramPreData } from './pre-transform';
 import {
 	createBlockHeader,
 	createCopyButton,
 	createPreviewButton,
+	createToggleSourceButton,
+	createSourceView,
 	createWrapper,
 	generateBlockId
 } from './code-block-utils';
@@ -65,13 +70,24 @@ export const rehypeEnhanceSvgBlocks: Plugin<[], Root> = () => {

 			const actions = [
 				createCopyButton(svgId, SVG_ID_ATTR, 'Copy svg source'),
+				createToggleSourceButton(svgId, SVG_ID_ATTR, 'Toggle svg source'),
 				createPreviewButton(svgId, SVG_ID_ATTR, 'Preview svg')
 			];

 			const header = createBlockHeader(SVG_LANGUAGE, svgId, SVG_ID_ATTR, actions);
-			const wrapper = createWrapper(header, node, SVG_WRAPPER_CLASS, SVG_SCROLL_CONTAINER_CLASS, {
-				[SVG_ID_ATTR]: svgId
-			});
+			const preservedCode = (node.data as DiagramPreData | undefined)?.sourceCode;
+			const sourceView = createSourceView(preservedCode, svgSource, SVG_LANGUAGE);
+			const wrapper = createWrapper(
+				header,
+				node,
+				SVG_WRAPPER_CLASS,
+				SVG_SCROLL_CONTAINER_CLASS,
+				{
+					[SVG_ID_ATTR]: svgId,
+					[DIAGRAM_VIEW_MODE_ATTR]: DIAGRAM_VIEW_RENDERED
+				},
+				[sourceView]
+			);

 			// Replace pre with wrapper in parent
 			(parent.children as ElementContent[])[index] = wrapper;
@@ -2,6 +2,15 @@ import type { Plugin } from 'unified';
 import type { Root, Element, ElementContent, Text } from 'hast';
 import { visit } from 'unist-util-visit';

+/**
+ * Metadata a diagram pre carries on its unist data field.
+ *
+ * sourceCode holds the highlighted code element captured before the pre became
+ * a render target. The pre transform writes it, and the mermaid and svg
+ * enhancers read it back to build a source view that matches the app code
+ * blocks.
+ */
+export interface DiagramPreData {
+	sourceCode: Element;
+}
+
 /**
 * Recursively extracts all text content from a HAST node.
 * Handles nested elements (e.g., span wrappers from syntax highlighting).
@@ -69,7 +78,10 @@ export function createPreTransform(
 					properties: {
 						className: [targetClass]
 					},
-					children: [{ type: 'text', value: text } as Text]
+					children: [{ type: 'text', value: text } as Text],
+					// Keep the highlighted code element so the block can offer a source
+					// view that matches the app code blocks without re-highlighting.
+					data: { sourceCode: codeElement } satisfies DiagramPreData
 				};

 				(parent.children as ElementContent[])[index] = pre;
@@ -0,0 +1,9 @@
+// Shared constants for diagram blocks (mermaid and svg) that toggle between a
+// rendered view and a source view. The wrapper carries the active mode, CSS
+// drives the visibility, and the click handler only flips the attribute.
+
+export const DIAGRAM_VIEW_MODE_ATTR = 'data-view-mode'; // wrapper attribute that holds the active view mode
+export const DIAGRAM_VIEW_RENDERED = 'rendered'; // mode value the enhancers set on the wrapper by default
+export const DIAGRAM_VIEW_SOURCE = 'source'; // mode value for the source view; presumably set by the click handler, confirm
+export const DIAGRAM_SOURCE_CLASS = 'diagram-source'; // class on the source view container
+export const TOGGLE_SOURCE_BTN_CLASS = 'toggle-source-btn'; // class on the toggle source button
@@ -39,3 +39,5 @@ export const MODALITY_LABELS = {
 export const COPY_ICON_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-copy-icon lucide-copy"><rect width="14" height="14" x="8" y="8" rx="2" ry="2"/><path d="M4 16c-1.1 0-2-.9-2-2V4c0-1.1.9-2 2-2h10c1.1 0 2 .9 2 2"/></svg>`;

 export const PREVIEW_ICON_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-eye lucide-eye-icon"><path d="M2.062 12.345a1 1 0 0 1 0-.69C3.5 7.73 7.36 5 12 5s8.5 2.73 9.938 6.655a1 1 0 0 1 0 .69C20.5 16.27 16.64 19 12 19s-8.5-2.73-9.938-6.655"/><circle cx="12" cy="12" r="3"/></svg>`;
+
+export const CODE_ICON_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-code lucide-code-icon"><path d="m16 18 6-6-6-6"/><path d="m8 6-6 6 6 6"/></svg>`; // 16x16 code icon markup, used by the toggle source button
@@ -30,6 +30,7 @@ export * from './literal-html';
 export * from './markdown';
 export * from './mermaid-blocks';
 export * from './svg-blocks';
+export * from './diagram-blocks';
 export * from './max-bundle-size';
 export * from './mcp';
 export * from './mcp-form';
@@ -41,7 +41,7 @@ if (LLAMA_BUILD_BORINGSSL)
    set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")

    set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
-    set(BORINGSSL_VERSION "0.20260526.0" CACHE STRING "BoringSSL version")
+    set(BORINGSSL_VERSION "0.20260616.0" CACHE STRING "BoringSSL version")

    message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
Author	SHA1	Message	Date
Alessandro de Oliveira Faria (A.K.A.CABELO)	74ade52741	vendor : update BoringSSL to 0.20260616.0 (#24693 )	2026-06-16 20:24:28 +02:00
Pascal	c1304d7b28	ui: add source toggle to mermaid and svg blocks (#24652 ) * ui: add source toggle to mermaid and svg blocks Add a toggle button next to copy and preview that switches a rendered mermaid or svg block to its source code and back. The button is shared by both block types and the rendered view stays the default. The source view reuses the code block scroll container and the highlighted code element captured at transform time, so it matches the app code blocks without highlighting again. Make tall diagrams scroll like text code blocks: safe centering keeps the diagram centered when it fits and falls back to start alignment when it overflows, so the top stays reachable instead of clipping above. Keep the block header opaque and layered above the scrolled diagram, and ignore header clicks in the zoom handler, so a button click never falls through to the zoom dialog. * ui: transparent diagram block header, address review from @allozaur	2026-06-16 14:14:22 +02:00
Oliver Simons	02810c7aa8	Fix and restrict NVFP4 edge-cases in llama-graph (#24331 ) * Move post-GEMM MUL required for dequant b4 lora and bias add see https://github.com/ggml-org/llama.cpp/pull/23484 : 1. For lora, I would presume we want fully dequantized values before doing the residuals, but this depends on how the LORAs were generated. Literature tells me LORA happens post-mul but pre-bias add https://github.com/ggml-org/llama.cpp/pull/8332 2. For ModelOPT, bias-add should happen on [fully-dequantized values](https://github.com/NVIDIA/Model-Optimizer/blob/b49f9b9e2d747af992d78a3aa7f10efe5a8847e1/modelopt/torch/quantization/backends/nvfp4_gemm.py#L59-L64) * Restrict build_ffn for NVFP4 to supported combinations	2026-06-16 11:52:38 +02:00
Ruixiang Wang	a1824902b5	spec: add backend sampling support for eagle3 (#24655 )	2026-06-16 12:05:52 +03:00
Winston Ma	32120c10e3	vulkan: prefer host-visible memory buffers on UMA devices (#22930 ) * implement UMA host-visible memory * update based on 0cc4m's suggestion	2026-06-16 09:36:52 +02:00
Jeff Bolz	d5fb104293	vulkan: Support gated_delta_net with S_v=16 (#24581 )	2026-06-16 09:26:57 +02:00
Ruixiang Wang	635b65ad7a	spec: add spec metrics mean acceptance length and acceptance rate per position (#24536 ) * spec: add spec metrics mean acceptance length and acceptance per pos * fix as suggestion Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix as suggestion Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix as suggestion Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix as suggestions --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-06-16 10:23:09 +03:00