server : slots monitoring endpoint (#5550 )

sampling : do not set min_keep to n_probs (#5564 )
cmake : fix GGML_USE_SYCL typo (#5555 )
2026-06-16 10:46:43 +02:00 · 2024-02-18 19:39:57 +02:00 · 2024-02-18 19:38:06 +02:00 · 2024-02-18 19:17:00 +02:00 · 2024-02-18 18:31:28 +02:00 · 2024-02-18 18:30:09 +02:00
4 changed files with 144 additions and 9 deletions
@@ -526,7 +526,7 @@ if (LLAMA_SYCL)

    message(STATUS "SYCL found")

-    add_compile_definitions(GML_USE_SYCL)
+    add_compile_definitions(GGML_USE_SYCL)

    if (LLAMA_SYCL_F16)
        add_compile_definitions(GGML_SYCL_F16)
@@ -121,7 +121,7 @@ static void sampler_queue(
                   struct llama_context * ctx_main,
            const llama_sampling_params & params,
                 llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
+                                 size_t   min_keep) {
    const float         temp              = params.temp;
    const float         dynatemp_range    = params.dynatemp_range;
    const float         dynatemp_exponent = params.dynatemp_exponent;
@@ -248,10 +248,7 @@ static llama_token llama_sampling_sample_impl(
            llama_sample_temp(ctx_main, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
-            // temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
-
-            sampler_queue(ctx_main, params, cur_p, min_keep);
+            sampler_queue(ctx_main, params, cur_p, 1);

            id = llama_sample_token(ctx_main, &cur_p);

@@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slot state may contain user data, including prompts.

 ## Build

@@ -135,6 +137,7 @@ node index.js
  - `{"status": "loading model"}` if the model is still being loaded.
  - `{"status": "error"}` if the model failed to load.
  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.

 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

@@ -379,6 +382,69 @@ Notice that each `probs` is an array of length `n_probs`.
    }'
    ```

+- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+
+### Result JSON
+
+```json
+[
+    {
+        "dynatemp_exponent": 1.0,
+        "dynatemp_range": 0.0,
+        "frequency_penalty": 0.0,
+        "grammar": "",
+        "id": 0,
+        "ignore_eos": false,
+        "logit_bias": [],
+        "min_p": 0.05000000074505806,
+        "mirostat": 0,
+        "mirostat_eta": 0.10000000149011612,
+        "mirostat_tau": 5.0,
+        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+        "n_ctx": 2048,
+        "n_keep": 0,
+        "n_predict": 100000,
+        "n_probs": 0,
+        "next_token": {
+            "has_next_token": true,
+            "n_remain": -1,
+            "num_tokens_predicted": 0,
+            "stopped_eos": false,
+            "stopped_limit": false,
+            "stopped_word": false,
+            "stopping_word": ""
+        },
+        "penalize_nl": true,
+        "penalty_prompt_tokens": [],
+        "presence_penalty": 0.0,
+        "prompt": "Say hello to llama.cpp",
+        "repeat_last_n": 64,
+        "repeat_penalty": 1.100000023841858,
+        "samplers": [
+            "top_k",
+            "tfs_z",
+            "typical_p",
+            "top_p",
+            "min_p",
+            "temperature"
+        ],
+        "seed": 42,
+        "state": 1,
+        "stop": [
+            "\n"
+        ],
+        "stream": false,
+        "task_id": 0,
+        "temperature": 0.0,
+        "tfs_z": 1.0,
+        "top_k": 40,
+        "top_p": 0.949999988079071,
+        "typical_p": 1.0,
+        "use_penalty_prompt_tokens": false
+    }
+]
+```
+
 ## More examples

 ### Change system prompt on runtime
@@ -41,6 +41,7 @@ struct server_params
    int32_t port = 8080;
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };

 bool server_verbose = false;
@@ -159,6 +160,7 @@ struct llama_client_slot
    int32_t n_decoded   = 0;
    int32_t n_remaining = -1;
    int32_t i_batch     = -1;
+    int32_t n_predict   = -1;

    int32_t num_prompt_tokens           = 0;
    int32_t num_prompt_tokens_processed = 0;
@@ -410,6 +412,7 @@ struct llama_server_context

            slot.id = i;
            slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;

            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);

@@ -546,6 +549,15 @@ struct llama_server_context
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);

+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
+
        // infill
        if (data.count("input_prefix") != 0)
        {
@@ -1053,6 +1065,7 @@ struct llama_server_context

        return json {
            {"n_ctx",             slot.n_ctx},
+            {"n_predict",         slot.n_predict},
            {"model",             params.model_alias},
            {"seed",              slot.params.seed},
            {"temperature",       slot.sparams.temp},
@@ -1914,14 +1927,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
    printf("  --log-disable             disables logging to a file.\n");
+    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
    printf("\n");
+    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
    printf("  --chat-template FORMAT_NAME");
-    printf("                            set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                            set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
    printf("\n");
 }

@@ -2361,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            log_set_target(stdout);
            LOG_INFO("logging to file is disabled.", {});
        }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
        else if (arg == "--chat-template")
        {
            if (++i >= argc)
@@ -2565,8 +2584,35 @@ int main(int argc, char **argv)
        server_state current_state = state.load();
        switch(current_state) {
            case SERVER_STATE_READY:
-                res.set_content(R"({"status": "ok"})", "application/json");
-                res.status = 200; // HTTP OK
+                if (llama.all_slots_are_idle) {
+                    res.set_content(R"({"status": "ok"})", "application/json");
+                    res.status = 200; // HTTP OK
+                } else {
+                    int available_slots = 0;
+                    int processing_slots = 0;
+                    for (llama_client_slot & slot : llama.slots) {
+                        if (slot.available()) {
+                            available_slots++;
+                        } else {
+                            processing_slots++;
+                        }
+                    }
+                    if (available_slots > 0) {
+                        json health = {
+                                {"status",           "ok"},
+                                {"slots_idle",       available_slots},
+                                {"slots_processing", processing_slots}};
+                        res.set_content(health.dump(), "application/json");
+                        res.status = 200; // HTTP OK
+                    } else {
+                        json health = {
+                                {"status",           "no slot available"},
+                                {"slots_idle",       available_slots},
+                                {"slots_processing", processing_slots}};
+                        res.set_content(health.dump(), "application/json");
+                        res.status = 503; // HTTP Service Unavailable
+                    }
+                }
                break;
            case SERVER_STATE_LOADING_MODEL:
                res.set_content(R"({"status": "loading model"})", "application/json");
@@ -2579,6 +2625,32 @@ int main(int argc, char **argv)
        }
    });

+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                        {"has_next_token", slot.has_next_token},
+                        {"n_remain", slot.n_remaining},
+                        {"num_tokens_predicted", slot.n_decoded},
+                        {"stopped_eos", slot.stopped_eos},
+                        {"stopped_word", slot.stopped_word},
+                        {"stopped_limit", slot.stopped_limit},
+                        {"stopping_word", slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
    svr.set_logger(log_server_request);

    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
Author	SHA1	Message	Date
Pierrick Hymbert	c145f8a132	server : slots monitoring endpoint (#5550 )	2024-02-18 19:39:57 +02:00
Georgi Gerganov	689a091bbe	sampling : do not set min_keep to n_probs (#5564 )	2024-02-18 19:38:06 +02:00
Georgi Gerganov	f3f28c5395	cmake : fix GGML_USE_SYCL typo (#5555 )	2024-02-18 19:17:00 +02:00
Pierrick Hymbert	e75c6279d1	server : enhanced health endpoint (#5548 ) * server: enrich health endpoint with available slots, return 503 if not slots are available * server: document new status no slot available in the README.md	2024-02-18 18:31:28 +02:00
Pierrick Hymbert	36376abe05	server : --n-predict option document and cap to max value (#5549 ) * server: document --n-predict * server: ensure client request cannot override n_predict if set * server: fix print usage LF in new --n-predict option	2024-02-18 18:30:09 +02:00