server : graceful server shutdown (#5244 )

This updates the server queue to support graceful shutdown of the server on signals.
common : fix ub (#5530 )
2026-06-16 10:46:43 +02:00 · 2024-02-18 18:23:16 +02:00 · 2024-02-18 18:21:52 +02:00 · 2024-02-18 18:20:12 +02:00 · 2024-02-18 18:19:23 +02:00
13 changed files with 88 additions and 56 deletions
@@ -1741,7 +1741,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l

    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
@@ -1750,7 +1750,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);

    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
@@ -1801,7 +1801,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
            if (cs_curr[j] < 0) { continue; }
            if (seqs.find(cs_curr[j]) == seqs.end()) {
                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                const size_t sz = seqs.size();
+                seqs[cs_curr[j]] = sz;
            }
        }
        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {

    const int n_ctx    = llama_n_ctx(ctx);

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
@@ -325,14 +325,14 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }

 static void init_model(struct my_llama_model * model) {
@@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);

    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);

    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

    // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);

-    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);

-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
@@ -53,7 +53,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py ../llava-v1.5-7b
+python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
@@ -19,10 +19,6 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
 projector = {name: checkpoint[name].float() for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")

-# remove these tensors from the checkpoint and save it again
-for name in mm_tensors:
-    del checkpoint[name]
-
 # BakLLaVA models contain CLIP tensors in it
 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
 if len(clip_tensors) > 0:
@@ -39,7 +35,7 @@ if len(clip_tensors) > 0:
            f.write("{}\n")


-torch.save(checkpoint, path)
+    torch.save(checkpoint, path)

 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
@@ -1623,7 +1623,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }

@@ -28,6 +28,7 @@
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
+#include <signal.h>

 using json = nlohmann::json;

@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
    }
 }

+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
        std::placeholders::_2,
        std::placeholders::_3
    ));
-    llama.queue_tasks.start_loop();

+    shutdown_handler = [&](int) {
+        llama.queue_tasks.terminate();
+    };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    llama.queue_tasks.start_loop();
+    svr.stop();
    t.join();

    llama_backend_free();
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
    int id = 0;
    std::mutex mutex_tasks;
+    bool running;
    // queues
    std::vector<task_server> queue_tasks;
    std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
        queue_tasks_deferred.clear();
    }

-    // Start the main loop. This call is blocking
-    [[noreturn]]
+    // end the start_loop routine
+    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            running = false;
+        }
+        condition_tasks.notify_all();
+    }
+
+    // Start the main loop.
    void start_loop() {
+        running = true;
        while (true) {
            // new task arrived
            LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                if (queue_tasks.empty()) {
+                    if (!running) {
+                        LOG_VERBOSE("ending start_loop", {});
+                        return;
+                    }
                    condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
                    });
                }
            }
@@ -111,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
 static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }

 static void set_param_model(struct my_llama_model * model) {
@@ -17909,7 +17909,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

                ptr += ggml_nbytes(tensor);

-                fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
            }
        }

@@ -18012,7 +18012,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

                result->nodes[i] = tensor;

-                fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
            }
        }
    }
@@ -38,8 +38,8 @@ term  ::= [0-9]+)""";
        // pretty print error message before asserting
        if (expected_pair.first != key || expected_pair.second != value)
        {
-            fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
-            fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
+            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
+            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
            fprintf(stderr, "expected_pair != actual_pair\n");
        }

@@ -96,9 +96,9 @@ term  ::= [0-9]+)""";
            // pretty print error message before asserting
            if (expected_element.type != element.type || expected_element.value != element.value)
            {
-                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
+                fprintf(stderr, "index: %u\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
                fprintf(stderr, "expected_element != actual_element\n");
            }

@@ -144,8 +144,8 @@ term  ::= [0-9]+)""";
        // pretty print error message before asserting
        if (expected_pair.first != key || expected_pair.second != value)
        {
-            fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
-            fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
+            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
+            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
            fprintf(stderr, "expected_pair != actual_pair\n");
        }

@@ -235,9 +235,9 @@ term  ::= [0-9]+)""";
            // pretty print error message before asserting
            if (expected_element.type != element.type || expected_element.value != element.value)
            {
-                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
+                fprintf(stderr, "index: %u\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
                fprintf(stderr, "expected_element != actual_element\n");
            }

@@ -180,8 +180,8 @@ int main()
            if (expected_element.type != element->type || expected_element.value != element->value)
            {
                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
                fprintf(stderr, "expected_element != actual_element\n");
            }
Author	SHA1	Message	Date
Daniel Hiltgen	66c1968f7a	server : graceful server shutdown (#5244 ) This updates the server queue to support graceful shutdown of the server on signals.	2024-02-18 18:23:16 +02:00
Georgi Gerganov	1dcc3fde00	common : fix ub (#5530 )	2024-02-18 18:21:52 +02:00
Herman Semenov	5d3de51f97	ggml, common, examples, tests : fixed type arguments in printf (#5528 )	2024-02-18 18:20:12 +02:00
Daniel Bevenius	fc0c8d286a	llava : update surgery script to not remove tensors (#5536 ) This commit updates the surgery script to not remove the tensors from the model file. For this to work the `--skip-unknown` flag is added as an argument to the convert.py script in README.md. The motivation for this change is that the surgery script currently removes the projector tensors from the model file. If the model was checked out from a repository, the model file will have been updated and have to be checked out again to reset this effect. If this can be avoided I think it would be preferable. I did not perform this change for BakLLaVA models as I am not sure how that part works.	2024-02-18 18:19:23 +02:00