ggml : add asserts (#14720)

* ggml : add asserts

ggml-ci

* cont : fix constant type

Co-authored-by: Diego Devesa <slarengh@gmail.com>

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
server : pre-calculate EOG logit biases (#14721)
2026-07-01 10:07:44 +02:00 · 2025-07-16 14:43:32 +03:00 · 2025-07-16 14:04:12 +03:00 · 2025-07-16 12:12:22 +02:00
6 changed files with 24 additions and 16 deletions
@@ -1005,15 +1005,21 @@ struct common_init_result common_init_from_params(common_params & params) {
        params.sampling.ignore_eos = false;
    }

-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-            if (llama_vocab_is_eog(vocab, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }

+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
    if (params.sampling.penalty_last_n == -1) {
        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
        params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -177,7 +177,8 @@ struct common_params_sampling {
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token>               preserved_tokens;

-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    // print the parameters into a string
    std::string print() const;
@@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32(

                const float scale = 1.0f/sqrtf(mean + eps);

+                // if you hit this, likely you got an inf somewhere earlier
+                assert(scale > 0.0f);
+
                ggml_vec_scale_f32(ne00, y, scale);
            }
        }
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
    for (int i = np; i < n; ++i) {
        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
    }
+
+    // if you hit this, you are likely running outside the FP range
+    assert(!isnan(sumf) && !isinf(sumf));
 #else
    for (int i = 0; i < n; ++i) {
        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
@@ -15763,6 +15763,7 @@ private:
        cb(zx, "mamba_in_proj", il);
        // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
+        zx = ggml_cont(ctx0, zx);
        zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
        cb(zx, "mamba_in_proj_out", il);

@@ -15780,7 +15781,6 @@ private:
        // conv1d
        {
            // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
-            x = ggml_view_2d(ctx0, x, d_inner, n_seq_tokens * n_seqs, d_inner * x->nb[0], 0);
            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
            cb(conv_x, "mamba_conv1d_input", il);

@@ -473,12 +473,9 @@ struct server_task {

            params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
            if (params.sampling.ignore_eos) {
-                for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-                    if (llama_vocab_is_eog(vocab, i)) {
-                        //SRV_DBG("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(ctx, i).c_str(), -INFINITY);
-                        params.sampling.logit_bias.push_back({i, -INFINITY});
-                    }
-                }
+                params.sampling.logit_bias.insert(
+                        params.sampling.logit_bias.end(),
+                        defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
            }
        }

@@ -1906,7 +1903,6 @@ struct server_context {

    bool clean_kv_cache = true;
    bool add_bos_token  = true;
-    bool has_eos_token  = false;

    int32_t n_ctx; // total context for all clients / slots

@@ -1965,7 +1961,6 @@ struct server_context {
        n_ctx = llama_n_ctx(ctx);

        add_bos_token = llama_vocab_get_add_bos(vocab);
-        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
Author	SHA1	Message	Date
Georgi Gerganov	64978340b0	ggml : add asserts (#14720 ) * ggml : add asserts ggml-ci * cont : fix constant type Co-authored-by: Diego Devesa <slarengh@gmail.com> --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>	2025-07-16 14:43:32 +03:00
Georgi Gerganov	6ffd4e9c44	server : pre-calculate EOG logit biases (#14721 ) ggml-ci	2025-07-16 14:04:12 +03:00
Shunta Saito	e4841d24d3	llama : fix parallel processing for plamo2 (#14716 )	2025-07-16 12:12:22 +02:00