ggml : optimize AMX (#24806)

Flatten the partition over n_batch * M so every thread participates in the quantization

| CPU                             | Model                         | Test   |   t/s OLD |   t/s NEW |   Speedup |
|:--------------------------------|:------------------------------|:-------|----------:|----------:|----------:|
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw  | pp512  |    730.71 |    779.86 |      1.07 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw  | tg128  |     87.88 |     86.79 |      0.99 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | pp512  |    725.09 |   1023.31 |      1.41 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | tg128  |     83.64 |     83.62 |      1.00 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0              | pp512  |    820.51 |    924.05 |      1.13 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0              | tg128  |     90.59 |     92.46 |      1.02 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1              | pp512  |    776.88 |    872.79 |      1.12 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1              | tg128  |     89.39 |     90.94 |      1.02 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M            | pp512  |    719.28 |   1009.27 |      1.40 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M            | tg128  |     80.62 |     80.86 |      1.00 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S            | pp512  |    732.29 |   1077.29 |      1.47 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S            | tg128  |     86.42 |     83.53 |      0.97 |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
convert : more consistent handling of rope_parameters (#24833)
2026-06-20 12:47:39 +02:00 · 2026-06-20 13:43:06 +03:00 · 2026-06-20 13:42:36 +03:00 · 2026-06-20 08:12:32 +09:00 · 2026-06-20 01:02:26 +02:00 · 2026-06-19 23:26:54 +02:00
38 changed files with 385 additions and 218 deletions
@@ -17,6 +17,7 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
+#include <shellapi.h>
 #endif

 #define JSON_ASSERT GGML_ASSERT
@@ -302,7 +303,6 @@ static handle_model_result common_params_handle_model(struct common_params_model

    if (!model.docker_repo.empty()) {
        model.path = common_docker_resolve_model(model.docker_repo);
-        model.name = model.docker_repo;
    } else if (!model.hf_repo.empty()) {
        // If -m was used with -hf, treat the model "path" as the hf_file to download
        if (model.hf_file.empty() && !model.path.empty()) {
@@ -322,7 +322,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
            throw std::runtime_error("failed to download model from Hugging Face");
        }

-        model.name = model.hf_repo;
        model.path = download_result.model_path;

        if (!download_result.mmproj_path.empty()) {
@@ -893,7 +892,44 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
    return true;
 }

+#ifdef _WIN32
+struct utf8_argv { // owns a UTF-8 copy of the process arguments; populated by make_utf8_argv()
+    std::vector<std::string> buf; // one UTF-8 string per argument
+    std::vector<char*> ptrs; // pointers into the strings held by buf, followed by one trailing nullptr
+};
+
+static utf8_argv make_utf8_argv() { // re-reads the process command line as wide strings and converts each argument to UTF-8
+    utf8_argv out;
+    int wargc = 0;
+    LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
+    if (!wargv) return out; // parsing failed: out.ptrs stays empty, which common_params_parse treats as "keep the original argv"
+
+    out.buf.reserve(wargc);
+    for (int i = 0; i < wargc; ++i) {
+        int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr); // sizing call: with input length -1 the returned byte count includes the terminating NUL
+        if (n <= 0) { out.buf.emplace_back(); continue; } // conversion failed: store an empty argument so indices still line up with wargv
+        auto& s = out.buf.emplace_back();
+        s.resize(static_cast<size_t>(n - 1)); // n counts the NUL; the string length excludes it
+        (void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr); // NOTE(review): flags are 0 here but WC_ERR_INVALID_CHARS in the sizing call - confirm intended
+    }
+    LocalFree(wargv); // frees the array returned by CommandLineToArgvW
+
+    out.ptrs.reserve(out.buf.size() + 1);
+    for (auto& s : out.buf) out.ptrs.push_back(s.data());
+    out.ptrs.push_back(nullptr); // argv-style terminating null entry
+    return out;
+}
+#endif
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+#ifdef _WIN32
+    auto utf8 = make_utf8_argv();
+    if (!utf8.ptrs.empty()) {
+        argc = static_cast<int>(utf8.buf.size());
+        argv = utf8.ptrs.data();
+    }
+#endif
+
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

@@ -2911,7 +2947,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
    add_opt(common_arg(
        {"--api-key-file"}, "FNAME",
-        "path to file containing API keys (default: none)",
+        "path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream key_file(value);
            if (!key_file) {
@@ -2919,7 +2955,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            std::string key;
            while (std::getline(key_file, key)) {
-                if (!key.empty()) {
+                if (!key.empty() && key[0] != '#') {
                    params.api_keys.push_back(key);
                }
            }
@@ -1074,6 +1074,18 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
    return files;
 }

+std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) { // opens fname for reading with the given mode; on Windows the name is first converted from UTF-8 to a wide string
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); // sizing call: required length in wchar_t, including the terminating NUL
+    if (!wlen) { return std::ifstream(); } // conversion failed: returns an unopened stream; NOTE(review): a default-constructed ifstream has no failbit set, so a caller's `if (!file)` check will not detect this - confirm intended
+    std::vector<wchar_t> wfname(wlen);
+    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen); // second call performs the actual conversion into wfname
+    return std::ifstream(wfname.data(), mode); // NOTE(review): relies on an ifstream constructor taking const wchar_t* - presumably a Windows standard-library extension; verify for non-MSVC toolchains
+#else
+    return std::ifstream(fname, mode);
+#endif
+}
+
 //
 // TTY utils
 //
@@ -295,7 +295,16 @@ struct common_params_model {
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
    std::string hf_file     = ""; // HF file                                                // NOLINT
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
-    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
+
+    std::string get_name() { // returns the model's name: hf_repo if set, otherwise docker_repo if set, otherwise path
+        if (!hf_repo.empty()) {
+            return hf_repo;
+        }
+        if (!docker_repo.empty()) {
+            return docker_repo;
+        }
+        return path; // neither repo field is set; NOTE(review): no member is modified here, so this could presumably be a const member function
+    }
 };

 // draft-model-based speculative decoding parameters
@@ -842,6 +851,9 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

+// open fname as an input file stream with the given mode; on Windows the UTF-8 file name is converted to a wide string before opening
+std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
+
 //
 // TTY utils
 //
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -1119,8 +1119,10 @@ class TextModel(ModelBase):

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+        partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
+        original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)

-        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
+        # Ensure global params are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type
+            if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
+                self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+            if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
+                self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings

    @classmethod
    def __init_subclass__(cls):
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
            rope_dim = self.hparams["attention_dim"]
        else:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):

        assert (hparams["activation_function"] == "silu")

-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = self.rope_parameters.get("partial_rotary_factor")
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))

@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
                factor = rope_params.get("factor", 16.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
-        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
            )
        self.gguf_writer.add_rope_dimension_count(
-            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+            int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
        )

        # MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
        super().set_gguf_parameters()

        rope_dim = self.hparams["qk_rope_head_dim"]
-        partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))

        # NextN/MTP prediction layers
@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])

-        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
+        rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
@@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            rope_dims = self.hparams["qk_rope_head_dim"]

-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -125,17 +125,18 @@ class NemotronModel(TextModel):
        self.gguf_writer.add_layer_norm_eps(f_norm_eps)

        # * Partial RoPE
-        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)

        # * RopeScaling for Nemotron
-        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+        factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
+        if factor is None:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        else:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+            self.gguf_writer.add_rope_scaling_factor(factor)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
-        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if not long_factors:
            return

        scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
+        rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

@@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):

        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

-        long_factors = rope_scaling.get('long_factor', None)
-        short_factors = rope_scaling.get('short_factor', None)
-
        if long_factors is None or short_factors is None:
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        rotary_factor = self.rope_parameters["partial_rotary_factor"]
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
-        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
+        old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);

-            parallel_for_ggml(params, n_batch, [&](int begin, int end) {
-                for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
+            parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
+                for (int idx = begin; idx < end; ++idx) {
+                    int batch_idx = idx / M;
+                    int m         = idx % M;
                    int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
                    const float * A_data = (const float *)((const char *)src1->data + src1_offset);
                    char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
-
-                    for (int m = 0; m < M; ++m) {
-                        from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
-                    }
+                    from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
                }
            });
        });
@@ -3788,7 +3788,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
    ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
 }

-static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) {
    wgpu::RequestAdapterOptions options = {};

 #ifndef __EMSCRIPTEN__
@@ -3800,17 +3800,20 @@ static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
    options.nextInChain                   = &adapterTogglesDesc;
 #endif

-    ctx->webgpu_global_ctx->instance.WaitAny(
-        ctx->webgpu_global_ctx->instance.RequestAdapter(
-            &options, wgpu::CallbackMode::AllowSpontaneous,
-            [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
-                if (status != wgpu::RequestAdapterStatus::Success) {
-                    GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                    return;
-                }
-                ctx->webgpu_global_ctx->adapter = std::move(adapter);
-            }),
-        UINT64_MAX);
+    instance.WaitAny(instance.RequestAdapter(
+                         &options, wgpu::CallbackMode::AllowSpontaneous,
+                         [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
+                             if (status != wgpu::RequestAdapterStatus::Success) {
+                                 GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
+                                 return;
+                             }
+                             adapter = std::move(_adapter);
+                         }),
+                     UINT64_MAX);
+}
+
+static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+    ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter);
    GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr);

    ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits);
@@ -4543,20 +4546,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
    // Probe for adapter support
    wgpu::Adapter adapter;
    if (ctx->webgpu_global_ctx->instance != nullptr) {
-        wgpu::RequestAdapterOptions options = {};
-
-        // probe for adapter support
-        ctx->webgpu_global_ctx->instance.WaitAny(
-            ctx->webgpu_global_ctx->instance.RequestAdapter(
-                &options, wgpu::CallbackMode::AllowSpontaneous,
-                [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
-                    if (status != wgpu::RequestAdapterStatus::Success) {
-                        GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                        return;
-                    }
-                    adapter = std::move(_adapter);
-                }),
-            UINT64_MAX);
+        ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter);
    }

    // WebGPU backend requires f16 support and, on native, implicit device synchronization.
@@ -600,18 +600,15 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
    // convert fname (UTF-8)
    wchar_t * wfname = ggml_mbstowcs(fname);
    if (wfname) {
-        // convert mode (ANSI)
-        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
-        wchar_t * wmode_p = wmode;
-        do {
-            *wmode_p++ = (wchar_t)*mode;
-        } while (*mode++);
-
-        // open file
-        file = _wfopen(wfname, wmode);
+        // convert mode (UTF-8)
+        wchar_t * wmode = ggml_mbstowcs(mode);
+        if (wmode) {
+            // open file
+            file = _wfopen(wfname, wmode);
+            GGML_FREE(wmode);
+        }

        GGML_FREE(wfname);
-        GGML_FREE(wmode);
    }

    return file;
@@ -202,7 +202,7 @@ struct cli_context {

    // TODO: support remote files in the future (http, https, etc)
    std::string load_input_file(const std::string & fname, bool is_media) {
-        std::ifstream file(fname, std::ios::binary);
+        std::ifstream file = fs_open_ifstream(fname, std::ios::binary);
        if (!file) {
            return "";
        }
@@ -13,6 +13,14 @@
 #include <sstream>
 #include <vector>
 #include <memory>
+#include <fstream>
+
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif

 // Internal header for clip.cpp

@@ -661,6 +669,22 @@ struct clip_image_f32_batch {
 // common utils
 //

+#ifdef _WIN32
+static std::ifstream open_ifstream_binary(const std::string & fname) { // Windows variant: converts the UTF-8 name to a wide string, then opens the file in binary mode
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); // sizing call: required length in wchar_t, including the terminating NUL
+    if (!wlen) {
+        throw std::runtime_error("failed to convert filename to UTF-16: " + fname); // unlike an open failure, a conversion failure is reported by throwing
+    }
+    std::vector<wchar_t> wfname(wlen);
+    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen); // second call performs the actual conversion into wfname
+    return std::ifstream(wfname.data(), std::ios::binary); // an open failure is left for the caller to detect on the returned stream
+}
+#else
+static std::ifstream open_ifstream_binary(const std::string & fname) { // non-Windows variant: passes the name to std::ifstream unchanged
+    return std::ifstream(fname, std::ios::binary);
+}
+#endif
+
 static std::string string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
@@ -1752,7 +1752,7 @@ struct clip_model_loader {
        std::map<std::string, size_t> tensor_offset;
        std::vector<ggml_tensor *> tensors_to_load;

-        auto fin = std::ifstream(fname, std::ios::binary);
+        auto fin = open_ifstream_binary(fname);
        if (!fin) {
            throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
        }
@@ -396,6 +396,9 @@ int main(int argc, char ** argv) {

    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });
+
    // Ctrl+C handling
    {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -582,13 +582,29 @@ mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx,
 }

 mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
-    std::vector<unsigned char> buf;
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+    if (!wlen) {
+        LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
+        return {nullptr, nullptr};
+    }
+    std::vector<wchar_t> wfname(wlen);
+    wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wfname.data(), wlen);
+    if (!wlen) {
+        LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
+        return {nullptr, nullptr};
+    }
+    FILE * f = _wfopen(wfname.data(), L"rb");
+#else
    FILE * f = fopen(fname, "rb");
+#endif
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
        return {nullptr, nullptr};
    }

+    std::vector<unsigned char> buf;
+
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);
@@ -180,6 +180,17 @@ That requires `JSON.stringify` when formatted to message content:
 }
 ```

+### Router mode: how the child and router communicate
+
+Upon spawning a new child process using `subprocess`, both the child and the router listen to the combined stdout/stderr stream.
+
+For the direction from child to router:
+- Generic messages are logs; they are forwarded to the router's stdout
+- Special state-update messages are prefixed with `cmd_child_to_router:state:`, followed by a JSON payload. See `server_models::handle_child_state` for details
+
+For the direction from router to child:
+- When the router sends `cmd_router_to_child:exit`, the child should exit gracefully. If the child is still running after `DEFAULT_STOP_TIMEOUT`, it is force-killed
+
 ### Model management API (router mode)

 Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
@@ -198,7 +198,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
+| `--api-key-file FNAME` | path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
@@ -12,6 +12,7 @@
 #include <random>
 #include <sstream>
 #include <fstream>
+#include <limits>

 json format_error_response(const std::string & message, const enum error_type type) {
    std::string type_str;
@@ -1238,7 +1239,7 @@ json format_response_rerank(
 // other utils
 //

-std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx, size_t n_top) {
    std::vector<llama_token_data> cur;

    const auto * logits = llama_get_logits_ith(ctx, idx);
@@ -1257,21 +1258,34 @@ std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int i
        }
    }

-    // sort tokens by logits
-    std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
-        return a.logit > b.logit;
-    });
+    // sort tokens by logits (partial: only the leading `n_top` need ordering)
+    if (n_top > cur.size()) {
+        n_top = cur.size();
+    }
+    if (n_top > 0) {
+        std::partial_sort(cur.begin(), cur.begin() + n_top, cur.end(),
+            [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+    }

    // apply softmax
-    float max_l = cur[0].logit;
+    float max_l = -std::numeric_limits<float>::infinity();
+    if (n_top > 0) {
+        max_l = cur[0].logit; // partial_sort guarantees the absolute maximum is at index 0
+    } else {
+        for (const auto & t : cur) {
+            max_l = std::max(max_l, t.logit);
+        }
+    }
    float cum_sum = 0.0f;
-    for (size_t i = 0; i < cur.size(); ++i) {
-        float p = expf(cur[i].logit - max_l);
-        cur[i].p = p;
+    for (auto & t : cur) {
+        float p = expf(t.logit - max_l);
+        t.p = p;
        cum_sum += p;
    }
-    for (size_t i = 0; i < cur.size(); ++i) {
-        cur[i].p /= cum_sum;
+    for (auto & t : cur) {
+        t.p /= cum_sum;
    }

    return cur;
@@ -326,7 +326,7 @@ json format_response_rerank(
 // other utils
 //

-std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx, size_t n_top);

 std::string safe_json_to_str(const json & data);

@@ -63,11 +63,6 @@ enum slot_state {
    SLOT_STATE_GENERATING,
 };

-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-};
-
 struct server_slot {
    int id;

@@ -773,6 +768,8 @@ public:
    // note: chat_params must not be refreshed upon existing sleeping state
    server_chat_params chat_params;

+    server_state_callback_t callback_state = [](server_state, json) -> void {};
+
    server_context_impl() {
        mtmd_helper_log_set(common_log_default_callback, nullptr);
    }
@@ -825,8 +822,7 @@ private:

    server_metrics metrics;

-    json json_ui_settings = json::object();    // Primary: new name
-    json json_webui_settings = json::object();    // Deprecated: use json_ui_settings instead (kept for compat)
+    json json_ui_settings = json::object();

    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;
@@ -1245,8 +1241,8 @@ private:
        if (!params_base.model_alias.empty()) {
            // backward compat: use first alias as model name
            model_name = *params_base.model_alias.begin();
-        } else if (!params_base.model.name.empty()) {
-            model_name = params_base.model.name;
+        } else if (!params_base.model.get_name().empty()) {
+            model_name = params_base.model.get_name();
        } else {
            // fallback: derive model name from file name
            auto model_path = std::filesystem::path(params_base.model.path);
@@ -1308,7 +1304,6 @@ private:
                try {
                    json json_settings = json::parse(cfg);
                    json_ui_settings = json_settings;
-                    json_webui_settings = json_settings; // deprecated: keep in sync
                } catch (const std::exception & e) {
                    SRV_ERR("%s: failed to parse UI config: %s\n", __func__, e.what());
                    return false;
@@ -1826,8 +1821,7 @@ private:
                });
            }
        } else {
-            // TODO: optimize this with min-p optimization
-            std::vector<llama_token_data> cur = get_token_probabilities(ctx_tgt, idx);
+            std::vector<llama_token_data> cur = get_token_probabilities(ctx_tgt, idx, n_probs_request);
            const size_t max_probs = cur.size();
            const size_t n_probs = std::min(max_probs, n_probs_request);

@@ -3687,7 +3681,6 @@ server_context_meta server_context::get_meta() const {
        /* has_inp_audio          */ impl->chat_params.allow_audio,
        /* has_inp_video          */ impl->chat_params.allow_video,
        /* json_ui_settings       */ impl->json_ui_settings,
-        /* json_webui_settings    */ impl->json_webui_settings,  // Deprecated
        /* slot_n_ctx             */ impl->get_slot_n_ctx(),
        /* pooling_type           */ llama_pooling_type(impl->ctx_tgt),

@@ -3738,8 +3731,11 @@ struct server_res_generator : server_http_res {
    }
 };

-void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
-    impl->queue_tasks.on_sleeping_state(std::move(callback));
+void server_context::set_state_callback(server_state_callback_t callback) {
+    impl->callback_state = std::move(callback);
+    impl->queue_tasks.on_sleeping_state([this](bool sleeping) {
+        impl->callback_state(sleeping ? SERVER_STATE_SLEEPING : SERVER_STATE_READY, {});
+    });
 }

 // compute the number of tokens before the last user message in the prompt
@@ -4300,12 +4296,8 @@ void server_routes::init_routes() {
            { "endpoint_slots",              params.endpoint_slots },
            { "endpoint_props",              params.endpoint_props },
            { "endpoint_metrics",            params.endpoint_metrics },
-            // New keys
            { "ui",                          params.ui },
            { "ui_settings",                 meta->json_ui_settings },
-            // Deprecated: use ui/ui_settings instead (kept for backward compat)
-            { "webui",                       params.ui },
-            { "webui_settings",              meta->json_ui_settings },
            { "chat_template",               tmpl_default },
            { "chat_template_caps",          meta->chat_template_caps },
            { "bos_token",                   meta->bos_token_str },
@@ -22,8 +22,7 @@ struct server_context_meta {
    bool has_inp_image;
    bool has_inp_audio;
    bool has_inp_video;
-    json json_ui_settings;            // Primary: new name
-    json json_webui_settings;            // Deprecated: use json_ui_settings instead (kept for backward compat)
+    json json_ui_settings;
    int slot_n_ctx;
    enum llama_pooling_type pooling_type;

@@ -53,6 +52,31 @@ struct server_context_meta {
    uint64_t model_size;
 };

+enum server_state {
+    // SERVER_STATE_DOWNLOADING,
+    SERVER_STATE_LOADING,
+    SERVER_STATE_READY,
+    SERVER_STATE_SLEEPING,
+};
+
+static std::string server_state_to_str(server_state state) {
+    switch (state) {
+        case SERVER_STATE_LOADING:     return "loading";
+        case SERVER_STATE_READY:       return "ready";
+        case SERVER_STATE_SLEEPING:    return "sleeping";
+        default: GGML_ASSERT(false && "invalid server_state");
+    }
+}
+
+static server_state server_state_from_str(const std::string & str) {
+    if (str == "loading")     return SERVER_STATE_LOADING;
+    if (str == "ready")       return SERVER_STATE_READY;
+    if (str == "sleeping")    return SERVER_STATE_SLEEPING;
+    GGML_ASSERT(false && "invalid server_state string");
+}
+
+using server_state_callback_t = std::function<void(server_state, json /* payload */)>;
+
 struct server_context {
    std::unique_ptr<server_context_impl> impl;

@@ -80,9 +104,8 @@ struct server_context {
    // not thread-safe, should only be used from the main thread
    server_context_meta get_meta() const;

-    // register a callback to be called when sleeping state changes
-    // must be set before load_model() is called
-    void on_sleeping_changed(std::function<void(bool)> callback);
+    // note: must be set before load_model() is called
+    void set_state_callback(server_state_callback_t callback);
 };


@@ -1,5 +1,6 @@
 #include "server-common.h"
 #include "server-models.h"
+#include "server-context.h"

 #include "build-info.h"
 #include "preset.h"
@@ -44,9 +45,7 @@ extern char **environ;
 #define DEFAULT_STOP_TIMEOUT 10 // seconds

 #define CMD_ROUTER_TO_CHILD_EXIT  "cmd_router_to_child:exit"
-#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
-#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
-#define CMD_CHILD_TO_ROUTER_INFO  "cmd_child_to_router:info:" // followed by json string
+#define CMD_CHILD_TO_ROUTER_STATE "cmd_child_to_router:state:" // followed by json string

 // address for child process, this is needed because router may run on 0.0.0.0
 // ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -904,12 +903,8 @@ void server_models::load(const std::string & name) {
                while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
                    LOG("[%5d] %s", port, buffer);
                    std::string str(buffer);
-                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                        this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
-                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
-                        this->update_loaded_info(name, str);
-                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
-                        this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
+                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STATE)) {
+                        this->handle_child_state(name, str);
                    }
                }
            } else {
@@ -976,7 +971,10 @@ void server_models::load(const std::string & name) {
        subprocess_destroy(&child_proc->get());

        // update status and exit code
-        this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
+        this->update_status(name, {
+            SERVER_MODEL_STATUS_UNLOADED,
+            exit_code
+        });
        SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
    });

@@ -1016,7 +1014,8 @@ struct server_models_download_res : public common_download_callback {
            common_download_model(model, opts);
            is_ok = true;
        } catch (const std::exception & e) {
-            SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
+            auto model_name = model.get_name();
+            SRV_ERR("download failed for model name=%s: %s\n", model_name.c_str(), e.what());
            is_ok = false;
        }
        return is_ok;
@@ -1036,7 +1035,7 @@ struct server_models_download_res : public common_download_callback {
 };

 void server_models::download(common_params_model && model, common_download_opts && opts) {
-    std::string name = model.name;
+    std::string name = model.get_name();
    GGML_ASSERT(name == model.hf_repo);

    std::unique_lock<std::mutex> lk(mutex);
@@ -1064,9 +1063,10 @@ void server_models::download(common_params_model && model, common_download_opts
    inst.th = std::thread([this, dl = std::move(dl)]() {
        dl->opts.callback = dl.get();
        bool ok = dl->run();
+        auto model_name = dl->model.get_name();
        SRV_INF("download finished for model name=%s with status=%s\n",
-                    dl->model.name.c_str(), ok ? "success" : "failure");
-        update_download_progress(dl->model.name, {}, true, ok);
+                    model_name.c_str(), ok ? "success" : "failure");
+        update_download_progress(model_name, {}, true, ok);
        // need_reload is set inside update_download_progress under the mutex;
        // the next load_models() call will clean up this instance
    });
@@ -1130,21 +1130,27 @@ void server_models::unload_all() {
    }
 }

-void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
+void server_models::update_status(const std::string & name, const update_status_args & args) {
    std::unique_lock<std::mutex> lk(mutex);
    auto it = mapping.find(name);
    if (it != mapping.end()) {
        auto & meta = it->second.meta;
-        meta.status    = status;
-        meta.exit_code = exit_code;
+        meta.status      = args.status;
+        meta.exit_code   = args.exit_code;
+        if (!args.loaded_info.is_null()) {
+            meta.loaded_info = args.loaded_info;
+        }
    }
    // broadcast status change to SSE
    {
        json data = {
-            {"status", server_model_status_to_string(status)},
+            {"status", server_model_status_to_string(args.status)},
        };
-        if (status == SERVER_MODEL_STATUS_UNLOADED) {
-            data["exit_code"] = exit_code;
+        if (args.status == SERVER_MODEL_STATUS_UNLOADED) {
+            data["exit_code"] = args.exit_code;
+        }
+        if (!args.loaded_info.is_null()) {
+            data["info"] = args.loaded_info;
        }
        // note: notify_sse doesn't acquire the lock, so no deadlock here
        notify_sse("status_change", name, data);
@@ -1152,29 +1158,6 @@ void server_models::update_status(const std::string & name, server_model_status
    cv.notify_all();
 }

-void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
-    if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
-        SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
-        return;
-    }
-
-    json info;
-    try {
-        info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
-    } catch (const std::exception & e) {
-        SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
-        return;
-    }
-
-    std::unique_lock<std::mutex> lk(mutex);
-    auto it = mapping.find(name);
-    if (it != mapping.end()) {
-        auto & meta = it->second.meta;
-        meta.loaded_info = info;
-    }
-    cv.notify_all();
-}
-
 void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
    json curr;
    {
@@ -1323,21 +1306,54 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
    return proxy;
 }

-bool server_models::is_child_server() {
+void server_models::handle_child_state(const std::string & name, const std::string & raw_input) {
+    server_state state;
+    json payload;
+
+    try {
+        json data = json::parse(raw_input.substr(strlen(CMD_CHILD_TO_ROUTER_STATE)));
+        state = server_state_from_str(json_value(data, "state", std::string()));
+        payload = json_value(data, "payload", json{});
+    } catch (const std::exception & e) {
+        SRV_ERR("failed to parse child state update for name=%s: %s\n", name.c_str(), e.what());
+        return;
+    }
+
+    switch (state) {
+        case SERVER_STATE_LOADING:
+            {
+                // do nothing for now
+                // TODO: report loading progress for first load and wakeup from sleep
+            } break;
+        case SERVER_STATE_READY:
+            {
+                update_status(name, {
+                    SERVER_MODEL_STATUS_LOADED,
+                    0,
+                    // note: payload can be empty if this is a wakeup from sleep
+                    payload.size() > 0 ? payload : nullptr
+                });
+            } break;
+        case SERVER_STATE_SLEEPING:
+            {
+                update_status(name, { SERVER_MODEL_STATUS_SLEEPING });
+            } break;
+        default:
+            // should never happen, but just in case
+            GGML_ASSERT(false && "unexpected state from child server");
+    }
+}
+
+//
+// server_child
+//
+
+bool server_child::is_child() {
    const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
    return router_port != nullptr;
 }

-std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
-    // send a notification to the router server that a model instance is ready
-    common_log_pause(common_log_main());
-    fflush(stdout);
-    fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
-    fflush(stdout);
-    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
-    fflush(stdout);
-    common_log_resume(common_log_main());
-
+std::thread server_child::setup(const std::function<void(int)> & shutdown_handler) {
    // setup thread for monitoring stdin
    return std::thread([shutdown_handler]() {
        // wait for EOF on stdin
@@ -1363,10 +1379,14 @@ std::thread server_models::setup_child_server(const std::function<void(int)> & s
    });
 }

-void server_models::notify_router_sleeping_state(bool is_sleeping) {
+void server_child::notify_to_router(const std::string & state, const json & payload) {
+    json data = {
+        {"state", state},
+        {"payload", payload},
+    };
    common_log_pause(common_log_main());
    fflush(stdout);
-    fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY);
+    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_STATE, safe_json_to_str(data).c_str());
    fflush(stdout);
    common_log_resume(common_log_main());
 }
@@ -1474,7 +1494,6 @@ void server_models_routes::init_routes() {
                }},
                // New key
                {"ui_settings",          ui_settings},
-                {"webui_settings",       webui_settings},
                {"build_info",           std::string(llama_build_info())},
                {"cors_proxy_enabled",   params.ui_mcp_proxy},
            });
@@ -1645,7 +1664,6 @@ void server_models_routes::init_routes() {
        common_params_model model;
        common_download_opts opts;

-        model.name           = name;
        model.hf_repo        = name;
        opts.bearer_token    = params.hf_token;
        opts.download_mmproj = true;
@@ -171,8 +171,12 @@ public:
    void download(common_params_model && model, common_download_opts && opts);

    // update the status of a model instance (thread-safe)
-    void update_status(const std::string & name, server_model_status status, int exit_code);
-    void update_loaded_info(const std::string & name, std::string & raw_info);
+    struct update_status_args {
+        server_model_status status;
+        int exit_code = 0; // only valid if status == UNLOADED
+        json loaded_info = nullptr;
+    };
+    void update_status(const std::string & name, const update_status_args & args);
    void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);

    // remove a cache model from disk and update the list (thread-safe)
@@ -193,21 +197,32 @@ public:
    // proxy an HTTP request to the model instance
    server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);

+    // handle message sent from server_child::notify_to_router()
+    // raw input must start with CMD_CHILD_TO_ROUTER_STATE, followed by a JSON string
+    // this function is not thread-safe, must be called from instance's monitoring thread
+    // payload per state:
+    //     state = loading     -> payload = {} (TODO: add progress info)
+    //     state = ready       -> payload = model_info (json), or {} if wakeup from sleeping
+    //     state = sleeping    -> payload = {}
+    void handle_child_state(const std::string & name, const std::string & raw_input);
+};
+
+struct server_child {
    // return true if the current process is a child server instance
-    static bool is_child_server();
+    bool is_child();

-    // notify the router server that a model instance is ready
+    // register the shutdown_handler to be called by the router
    // return the monitoring thread (to be joined by the caller)
-    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
+    std::thread setup(const std::function<void(int)> & shutdown_handler);

-    // notify the router server that the sleeping state has changed
-    static void notify_router_sleeping_state(bool sleeping);
+    // notify router server for status changes (e.g. loading, downloading, sleeping, etc.)
+    // message will be handled by server_models::handle_child_state() on the router side
+    void notify_to_router(const std::string & state_name, const json & payload);
 };

 struct server_models_routes {
    common_params params;
    json ui_settings = json::object();     // Primary: new name
-    json webui_settings = json::object();  // Deprecated: use ui_settings (kept for compat)
    std::atomic<bool> stopping = false;    // for graceful disconnecting SSE clients during shutdown
    server_models models;
    server_models_routes(const common_params & params, int argc, char ** argv)
@@ -217,7 +232,6 @@ struct server_models_routes {
            try {
                json json_settings = json::parse(cfg);
                ui_settings = json_settings;
-                webui_settings = json_settings;  // Deprecated: keep in sync
            } catch (const std::exception & e) {
                LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what());
                throw;
@@ -90,8 +90,10 @@ int llama_server(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // router server never loads a model and must not touch the GPU
+    const bool is_router_server = params.model.path.empty()
+                               && params.model.hf_repo.empty();
+
    // skip device enumeration so the CUDA primary context stays uncreated
-    const bool is_router_server = params.model.path.empty();
    common_params_print_info(params, !is_router_server);

    if (!is_router_server) {
@@ -113,8 +115,9 @@ int llama_server(int argc, char ** argv) {
    }

    // for consistency between server router mode and single-model mode, we set the same model name as alias
-    if (params.model_alias.empty() && !params.model.name.empty()) {
-        params.model_alias.insert(params.model.name);
+    auto model_name = params.model.get_name();
+    if (params.model_alias.empty() && !model_name.empty()) {
+        params.model_alias.insert(model_name);
    }

    // struct that contains llama context and inference
@@ -255,6 +258,7 @@ int llama_server(int argc, char ** argv) {
    // Start the server
    //

+    server_child child; // only used in non-router mode
    std::function<void()> clean_up;

    if (is_router_server) {
@@ -300,15 +304,16 @@ int llama_server(int argc, char ** argv) {
            return 1;
        }

-        // load the model
-        SRV_INF("%s", "loading model\n");
-
-        if (server_models::is_child_server()) {
-            ctx_server.on_sleeping_changed([&](bool sleeping) {
-                server_models::notify_router_sleeping_state(sleeping);
+        // setup communication child --> router if necessary
+        if (child.is_child()) {
+            ctx_server.set_state_callback([&](server_state state, json payload) {
+                child.notify_to_router(server_state_to_str(state), payload);
            });
        }

+        // load the model
+        SRV_INF("%s", "loading model\n");
+
        if (!ctx_server.load_model(params)) {
            clean_up();
            if (ctx_http.thread.joinable()) {
@@ -365,9 +370,9 @@ int llama_server(int argc, char ** argv) {

        // optionally, notify router server that this instance is ready
        std::thread monitor_thread;
-        if (server_models::is_child_server()) {
-            json model_info = routes.get_model_info();
-            monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
+        if (child.is_child()) {
+            monitor_thread = child.setup(shutdown_handler);
+            child.notify_to_router(server_state_to_str(SERVER_STATE_READY), routes.get_model_info());
        }

        // this call blocks the main thread until queue_tasks.terminate() is called
@@ -79,9 +79,9 @@ def test_load_split_model():
    assert match_regex("(little|girl)+", res.body["content"])


-def test_no_webui():
+def test_no_ui():
    global server
-    # default: webui enabled
+    # default: UI enabled
    server.start()
    url = f"http://{server.server_host}:{server.server_port}"
    res = requests.get(url)
@@ -89,8 +89,8 @@ def test_no_webui():
    assert "<!doctype html>" in res.text
    server.stop()

-    # with --no-webui
-    server.no_webui = True
+    # with --no-ui, the UI should be disabled
+    server.no_ui = True
    server.start()
    res = requests.get(url)
    assert res.status_code == 404
@@ -12,7 +12,7 @@ def create_server():

 def test_mcp_no_proxy():
    global server
-    server.webui_mcp_proxy = False
+    server.ui_mcp_proxy = False
    server.start()

    res = server.make_request("GET", "/cors-proxy")
@@ -21,7 +21,7 @@ def test_mcp_no_proxy():

 def test_mcp_proxy():
    global server
-    server.webui_mcp_proxy = True
+    server.ui_mcp_proxy = True
    server.start()

    url = f"http://{server.server_host}:{server.server_port}/cors-proxy?url=http://example.com"
@@ -32,7 +32,7 @@ def test_mcp_proxy():

 def test_mcp_proxy_custom_port():
    global server
-    server.webui_mcp_proxy = True
+    server.ui_mcp_proxy = True
    server.start()

    # try getting the server's models API via the proxy
@@ -94,7 +94,7 @@ class ServerProcess:
    enable_ctx_shift: int | None = False
    spec_draft_n_min: int | None = None
    spec_draft_n_max: int | None = None
-    no_webui: bool | None = None
+    no_ui: bool | None = None
    jinja: bool | None = None
    reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
    reasoning: Literal['on', 'off', 'auto'] | None = None
@@ -107,7 +107,7 @@ class ServerProcess:
    cache_ram: int | None = None
    no_cache_idle_slots: bool = False
    log_path: str | None = None
-    webui_mcp_proxy: bool = False
+    ui_mcp_proxy: bool = False
    backend_sampling: bool = False
    gcp_compat: bool = False

@@ -225,8 +225,8 @@ class ServerProcess:
            server_args.extend(["--spec-draft-n-max", self.spec_draft_n_max])
        if self.spec_draft_n_min:
            server_args.extend(["--spec-draft-n-min", self.spec_draft_n_min])
-        if self.no_webui:
-            server_args.append("--no-webui")
+        if self.no_ui:
+            server_args.append("--no-ui")
        if self.no_models_autoload:
            server_args.append("--no-models-autoload")
        if self.jinja:
@@ -251,8 +251,8 @@ class ServerProcess:
            server_args.extend(["--cache-ram", self.cache_ram])
        if self.no_cache_idle_slots:
            server_args.append("--no-cache-idle-slots")
-        if self.webui_mcp_proxy:
-            server_args.append("--webui-mcp-proxy")
+        if self.ui_mcp_proxy:
+            server_args.append("--ui-mcp-proxy")
        if self.backend_sampling:
            server_args.append("--backend_sampling")
        if self.gcp_compat:
Author	SHA1	Message	Date
Adrien Gallouët	37a77fb057	ggml : optimize AMX (#24806 ) Flatten the partition over n_batch * M so every thread participates in the quantization \| CPU \| Model \| Test \| t/s OLD \| t/s NEW \| Speedup \| \|:--------------------------------\|:------------------------------\|:-------\|----------:\|----------:\|----------:\| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_NL - 4.5 bpw \| pp512 \| 730.71 \| 779.86 \| 1.07 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_NL - 4.5 bpw \| tg128 \| 87.88 \| 86.79 \| 0.99 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_XS - 4.25 bpw \| pp512 \| 725.09 \| 1023.31 \| 1.41 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_XS - 4.25 bpw \| tg128 \| 83.64 \| 83.62 \| 1.00 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_0 \| pp512 \| 820.51 \| 924.05 \| 1.13 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_0 \| tg128 \| 90.59 \| 92.46 \| 1.02 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_1 \| pp512 \| 776.88 \| 872.79 \| 1.12 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_1 \| tg128 \| 89.39 \| 90.94 \| 1.02 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_M \| pp512 \| 719.28 \| 1009.27 \| 1.40 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_M \| tg128 \| 80.62 \| 80.86 \| 1.00 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_S \| pp512 \| 732.29 \| 1077.29 \| 1.47 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_S \| tg128 \| 86.42 \| 83.53 \| 0.97 \| Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-20 13:43:06 +03:00
Sigbjørn Skjæret	f4043fec01	convert : more consistent handling of rope_parameters (#24833 )	2026-06-20 13:42:36 +03:00
Masashi Yoshimura	f449e05537	ggml-webgpu: add adapter toggles for F16 on Vulkan + NVIDIA	2026-06-20 08:12:32 +09:00
Xuan-Son Nguyen	2b686a9120	server: refactor child --> router communication (#24821 ) * server: refactor child --> router communication * fix wakeup case * add docs * improve update_status() * nits	2026-06-20 01:02:26 +02:00
Adrien Gallouët	4b48a53b6c	server : optimize get_token_probabilities (#24796 ) Use std::partial_sort to order only the requested top-n tokens instead of the full vocabulary logprobs sort: vocab=128000 n_top=0 iters=100 full sort: 8555.6 us/op partial sort: 704.3 us/op Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-19 23:26:54 +02:00
Xuan-Son Nguyen	e475fa2b5f	mtmd, arg: fix utf8 handling on windows (#24779 ) * mtmd, arg: fix utf8 handling on windows * also fix ggml_fopen * fix build fail * also fix CLI	2026-06-19 22:28:38 +02:00
Xuan-Son Nguyen	175147e8f6	server: remove all internal mentions about "webui" (#24817 )	2026-06-19 22:12:46 +02:00
Mikolaj Kucharski	fabde3bf51	arg: Add comment line support to --api-key-file (#23168 )	2026-06-19 17:33:54 +02:00