server : add pidfile option

So we can track the pid of this process.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-06-17 21:47:53 +01:00
9 changed files with 139 additions and 83 deletions
@@ -3373,5 +3373,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg({ "--pidfile" }, "FILE", "path to PID file for server process",
+                       [](common_params & params, const std::string & value) { params.pidfile = value; })
+                .set_examples({ LLAMA_EXAMPLE_SERVER }));
+
    return ctx_arg;
 }
@@ -366,6 +366,7 @@ struct common_params {
    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
+    std::string pidfile       = "";  // path to PID file for server process                           // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
@@ -556,8 +556,11 @@ class TextModel(ModelBase):
            logger.info(f"gguf: experts used count = {n_experts_used}")

        if (head_dim := self.hparams.get("head_dim")) is not None:
-            self.gguf_writer.add_key_length(head_dim)
-            self.gguf_writer.add_value_length(head_dim)
+            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
+            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
+            if self.hparams.get("model_type") != "deepseek_v3":
+                self.gguf_writer.add_key_length(head_dim)
+                self.gguf_writer.add_value_length(head_dim)

        self.gguf_writer.add_file_type(self.ftype)
        logger.info(f"gguf: file type = {self.ftype}")
@@ -1898,7 +1901,9 @@ class LlamaModel(TextModel):
        hparams = self.hparams
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if (rope_dim := hparams.get("head_dim")) is None:
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -1980,8 +1985,7 @@ class LlamaModel(TextModel):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-                if (dim := self.hparams.get("head_dim")) is None:
-                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                factor = rope_scaling.get("factor", 8.0)
@@ -2317,7 +2321,9 @@ class DeciModel(TextModel):
        hparams = self.hparams
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if (rope_dim := hparams.get("head_dim")) is None:
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -2357,8 +2363,7 @@ class DeciModel(TextModel):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-                if (dim := self.hparams.get("head_dim")) is None:
-                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                factor = rope_scaling.get("factor", 8.0)
@@ -3676,7 +3681,9 @@ class InternLM3Model(TextModel):
        hparams = self.hparams
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if (rope_dim := hparams.get("head_dim")) is None:
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -5091,7 +5098,9 @@ class DeepseekModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-        if (rope_dim := hparams.get("head_dim")) is None:
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

        self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5981,8 +5990,7 @@ class ExaoneModel(TextModel):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-                if (dim := self.hparams.get("head_dim")) is None:
-                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                factor = rope_scaling.get("factor", 8.0)
@@ -6094,8 +6102,7 @@ class BailingMoeModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-        if (rope_dim := hparams.get("head_dim")) is None:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]

        self.gguf_writer.add_rope_dimension_count(rope_dim)
        rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6127,8 +6134,7 @@ class BailingMoeModel(TextModel):
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")
        n_embd = self.hparams["hidden_size"]
-        if (head_dim := self.hparams.get("head_dim")) is None:
-            head_dim = n_embd // n_head
+        head_dim = self.hparams.get("head_dim") or n_embd // n_head

        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)

@@ -368,8 +368,6 @@ if (MSVC)
        /wd4005  # Macro redefinition
        /wd4244  # Conversion from one type to another type, possible loss of data
        /wd4267  # Conversion from 'size_t' to a smaller type, possible loss of data
-        /wd4305  # Conversion from 'type1' to 'type2', possible loss of data
-        /wd4566  # Conversion from 'char' to 'wchar_t', possible loss of data
        /wd4996  # Disable POSIX deprecation warnings
        /wd4702  # Unreachable code warnings
    )
@@ -389,46 +387,4 @@ if (MSVC)
    disable_msvc_warnings(ggml-cpu-skylakex)
    disable_msvc_warnings(ggml-cpu-icelake)
    disable_msvc_warnings(ggml-cpu-alderlake)
-
-    if (GGML_BUILD_EXAMPLES)
-        disable_msvc_warnings(common-ggml)
-        disable_msvc_warnings(common)
-
-        disable_msvc_warnings(mnist-common)
-        disable_msvc_warnings(mnist-eval)
-        disable_msvc_warnings(mnist-train)
-
-        disable_msvc_warnings(gpt-2-ctx)
-        disable_msvc_warnings(gpt-2-alloc)
-        disable_msvc_warnings(gpt-2-backend)
-        disable_msvc_warnings(gpt-2-sched)
-        disable_msvc_warnings(gpt-2-quantize)
-        disable_msvc_warnings(gpt-2-batched)
-
-        disable_msvc_warnings(gpt-j)
-        disable_msvc_warnings(gpt-j-quantize)
-
-        disable_msvc_warnings(magika)
-        disable_msvc_warnings(yolov3-tiny)
-        disable_msvc_warnings(sam)
-
-        disable_msvc_warnings(simple-ctx)
-        disable_msvc_warnings(simple-backend)
-    endif()
-
-    if (GGML_BUILD_TESTS)
-        disable_msvc_warnings(test-mul-mat)
-        disable_msvc_warnings(test-arange)
-        disable_msvc_warnings(test-backend-ops)
-        disable_msvc_warnings(test-cont)
-        disable_msvc_warnings(test-conv-transpose)
-        disable_msvc_warnings(test-conv-transpose-1d)
-        disable_msvc_warnings(test-conv1d)
-        disable_msvc_warnings(test-conv2d)
-        disable_msvc_warnings(test-conv2d-dw)
-        disable_msvc_warnings(test-customop)
-        disable_msvc_warnings(test-dup)
-        disable_msvc_warnings(test-opt)
-        disable_msvc_warnings(test-pool)
-    endif ()
 endif()
@@ -144,15 +144,9 @@ if (Vulkan_FOUND)
                   ${VULKAN_SHADER_GEN_CMAKE_ARGS}

        BUILD_COMMAND   ${CMAKE_COMMAND} --build   . --config $<CONFIG>
-
-        # NOTE: When DESTDIR is set using Makefile generators and
-        # "make install" triggers the build step, vulkan-shaders-gen
-        # would be installed into the DESTDIR prefix, so it is unset
-        # to ensure that does not happen.
-
-        INSTALL_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR
-                        ${CMAKE_COMMAND} --install . --config $<CONFIG>
+        INSTALL_COMMAND ${CMAKE_COMMAND} --install . --config $<CONFIG>
    )
+    ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)

    set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
    set (_ggml_vk_genshaders_dir "${CMAKE_BINARY_DIR}/$<CONFIG>")
@@ -178,6 +172,8 @@ if (Vulkan_FOUND)

        DEPENDS ${_ggml_vk_shader_files}
                vulkan-shaders-gen
+                vulkan-shaders-gen-build
+                vulkan-shaders-gen-install

        COMMENT "Generate vulkan shaders"
    )
@@ -888,6 +888,12 @@ struct ggml_context {
    struct ggml_object * objects_end;
 };

+struct ggml_context_container {  // pairs one ggml_context, stored by value, with a flag
+    bool used;  // NOTE(review): presumably marks the slot as handed out; no reader or writer is visible in this hunk - confirm
+
+    struct ggml_context context;  // embedded by value, not referenced through a pointer
+};
+
 //
 // data types
 //
@@ -1 +1 @@
-8cda0a3c19f2c7dc493887353c42f6956bc268b1
+6a7d170c04789f6ebcf320ed03c1b16973f93bd7
@@ -333,7 +333,7 @@ int32_t llm_chat_apply_template(
            std::string role(message->role);
            if (role == "system") {
                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
-                system_prompt += trim(message->content);
+                system_prompt = trim(message->content);
                continue;
            }
            // in gemma, "assistant" is "model"
@@ -355,7 +355,7 @@ int32_t llm_chat_apply_template(
            std::string role(message->role);
            if (role == "system") {
                // there is no system message support, we will merge it with user prompt
-                system_prompt += message->content;
+                system_prompt = message->content;
                continue;
            } else if (role == "user") {
                ss << "Human: ";
@@ -14,22 +14,31 @@
 // mime type for sending response
 #define MIMETYPE_JSON "application/json; charset=utf-8"

+#include <signal.h>
+
+#include <atomic>
+#include <chrono>
+#include <cinttypes>
+#include <condition_variable>
+#include <cstddef>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+
 // auto generated files (see README.md for details)
 #include "index.html.gz.hpp"
 #include "loading.html.hpp"

-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <cstddef>
-#include <cinttypes>
-#include <deque>
-#include <memory>
-#include <mutex>
-#include <signal.h>
-#include <thread>
-#include <unordered_map>
-#include <unordered_set>
+#ifdef _WIN32
+#include <process.h>
+#define getpid _getpid
+#define pid_t int  // no trailing ';' here: it would be pasted into every declaration that names pid_t
+#else
+#include <unistd.h>
+#endif

 using json = nlohmann::ordered_json;

@@ -3691,6 +3700,77 @@ inline void signal_handler(int signal) {
    shutdown_handler(signal);
 }

+// Tells whether a process with id `pid` currently exists.
+// Probes with kill(pid, 0): signal 0 is never delivered, only the existence and permission checks are run.
+static bool check_pid_alive(const pid_t pid) {
+    if (pid <= 0) {
+        return false;  // zero and negative ids address process groups in kill(), not a single process
+    }
+
+    if (kill(pid, 0) == 0) {
+        return true;
+    }
+    return errno == EPERM;  // EPERM: the process exists but may not be signalled; any other error counts as dead
+}
+
+// Owns a stdio stream on a pidfile and, when asked to at open(), deletes the file once the stream is closed.
+class PidFile {
+  public:
+    FILE *      file = nullptr;  // open stream, nullptr while nothing is open
+    std::string fname;           // path passed to the most recent open()
+    bool        rm = false;      // remove fname from disk when the stream is closed
+
+    PidFile() = default;
+    PidFile(const PidFile &) = delete;  // a copy would fclose() and unlink() the same file a second time
+    PidFile & operator=(const PidFile &) = delete;
+    ~PidFile() { close(); }
+    // Opens `filename` with fopen-style `mode`; `r` asks for removal on close. Returns the stream (callers treat null as failure).
+    FILE * open(const std::string & filename, const char * mode, const bool r = false) {
+        close();  // do not leak a stream left open by an earlier open()
+        file  = ggml_fopen(filename.c_str(), mode);
+        fname = filename;
+        rm    = r;
+        return file;
+    }
+    // Closes the stream, then removes the file if that was requested. Safe to call when nothing is open.
+    void close() {
+        if (file) {
+            fclose(file);
+            file = nullptr;
+            if (rm) {
+                unlink(fname.c_str());
+            }
+        }
+    }
+};
+
+// Returns true (after logging it) when `filename` contains the pid of a process that still exists.
+// A pidfile that cannot be opened or does not start with a number counts as stale: returns false.
+static bool is_old_pid_alive(const std::string & filename) {
+    int     recorded = 0;  // parsed as int, the type "%d" expects; pid_t is not guaranteed to be int
+    PidFile f;             // opened with rm == false, so closing it leaves the file on disk
+    if (!f.open(filename, "r") || fscanf(f.file, "%d", &recorded) != 1) {
+        return false;
+    }
+    if (!check_pid_alive(static_cast<pid_t>(recorded))) {
+        return false;
+    }
+    LOG_ERR("Process already running with PID %d\n", recorded);
+    return true;
+}
+
+// Opens `pidfile` through `f` (removal-on-close requested, stream not closed here) and writes this process's pid.
+// The pid is flushed immediately; returns 0 on success and -1, after logging why, if opening or writing fails.
+static int create_pidfile(const std::string & pidfile, PidFile & f) {
+    const bool opened  = f.open(pidfile, "w", true) != nullptr;
+    const bool written = opened && fprintf(f.file, "%d\n", static_cast<int>(getpid())) >= 0 && fflush(f.file) == 0;
+    if (!written) {
+        LOG_ERR("Unable to %s pidfile %s: %s\n", opened ? "write" : "open", pidfile.c_str(), strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
 int main(int argc, char ** argv) {
    // own arguments required by this example
    common_params params;
@@ -3699,6 +3779,13 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    PidFile f;
+    if (!params.pidfile.empty()) {
+        if (is_old_pid_alive(params.pidfile) || create_pidfile(params.pidfile, f)) {
+            return 1;
+        }
+    }
+
    common_init();

    // struct that contains llama context and inference