mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-20 12:47:39 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 37a77fb057 | |||
| f4043fec01 | |||
| f449e05537 | |||
| 2b686a9120 | |||
| 4b48a53b6c | |||
| e475fa2b5f | |||
| 175147e8f6 | |||
| fabde3bf51 |
+40
-4
@@ -17,6 +17,7 @@
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <shellapi.h>
|
||||
#endif
|
||||
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
@@ -302,7 +303,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
|
||||
if (!model.docker_repo.empty()) {
|
||||
model.path = common_docker_resolve_model(model.docker_repo);
|
||||
model.name = model.docker_repo;
|
||||
} else if (!model.hf_repo.empty()) {
|
||||
// If -m was used with -hf, treat the model "path" as the hf_file to download
|
||||
if (model.hf_file.empty() && !model.path.empty()) {
|
||||
@@ -322,7 +322,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
throw std::runtime_error("failed to download model from Hugging Face");
|
||||
}
|
||||
|
||||
model.name = model.hf_repo;
|
||||
model.path = download_result.model_path;
|
||||
|
||||
if (!download_result.mmproj_path.empty()) {
|
||||
@@ -893,7 +892,44 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
struct utf8_argv {
|
||||
std::vector<std::string> buf;
|
||||
std::vector<char*> ptrs;
|
||||
};
|
||||
|
||||
static utf8_argv make_utf8_argv() {
|
||||
utf8_argv out;
|
||||
int wargc = 0;
|
||||
LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
|
||||
if (!wargv) return out;
|
||||
|
||||
out.buf.reserve(wargc);
|
||||
for (int i = 0; i < wargc; ++i) {
|
||||
int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr);
|
||||
if (n <= 0) { out.buf.emplace_back(); continue; }
|
||||
auto& s = out.buf.emplace_back();
|
||||
s.resize(static_cast<size_t>(n - 1));
|
||||
(void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr);
|
||||
}
|
||||
LocalFree(wargv);
|
||||
|
||||
out.ptrs.reserve(out.buf.size() + 1);
|
||||
for (auto& s : out.buf) out.ptrs.push_back(s.data());
|
||||
out.ptrs.push_back(nullptr);
|
||||
return out;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
#ifdef _WIN32
|
||||
auto utf8 = make_utf8_argv();
|
||||
if (!utf8.ptrs.empty()) {
|
||||
argc = static_cast<int>(utf8.buf.size());
|
||||
argv = utf8.ptrs.data();
|
||||
}
|
||||
#endif
|
||||
|
||||
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
||||
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
||||
|
||||
@@ -2911,7 +2947,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
||||
add_opt(common_arg(
|
||||
{"--api-key-file"}, "FNAME",
|
||||
"path to file containing API keys (default: none)",
|
||||
"path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream key_file(value);
|
||||
if (!key_file) {
|
||||
@@ -2919,7 +2955,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
std::string key;
|
||||
while (std::getline(key_file, key)) {
|
||||
if (!key.empty()) {
|
||||
if (!key.empty() && key[0] != '#') {
|
||||
params.api_keys.push_back(key);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1074,6 +1074,18 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
|
||||
return files;
|
||||
}
|
||||
|
||||
std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) {
|
||||
#ifdef _WIN32
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
|
||||
if (!wlen) { return std::ifstream(); }
|
||||
std::vector<wchar_t> wfname(wlen);
|
||||
(void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
|
||||
return std::ifstream(wfname.data(), mode);
|
||||
#else
|
||||
return std::ifstream(fname, mode);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
+13
-1
@@ -295,7 +295,16 @@ struct common_params_model {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
|
||||
std::string get_name() {
|
||||
if (!hf_repo.empty()) {
|
||||
return hf_repo;
|
||||
}
|
||||
if (!docker_repo.empty()) {
|
||||
return docker_repo;
|
||||
}
|
||||
return path;
|
||||
}
|
||||
};
|
||||
|
||||
// draft-model-based speculative decoding parameters
|
||||
@@ -842,6 +851,9 @@ struct common_file_info {
|
||||
};
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||
|
||||
// fs open, also handle UTF8 on Windows
|
||||
std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
|
||||
+7
-1
@@ -1119,8 +1119,10 @@ class TextModel(ModelBase):
|
||||
|
||||
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
|
||||
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
|
||||
partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
|
||||
original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)
|
||||
|
||||
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
|
||||
# Ensure global params are mirrored in rope_parameters
|
||||
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
|
||||
if local_rope_theta is not None:
|
||||
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
|
||||
@@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
|
||||
self.rope_parameters["rope_theta"] = rope_theta
|
||||
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
|
||||
self.rope_parameters["rope_type"] = rope_type
|
||||
if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
|
||||
self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
||||
if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
|
||||
self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings
|
||||
|
||||
@classmethod
|
||||
def __init_subclass__(cls):
|
||||
|
||||
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
|
||||
rope_dim = self.hparams["attention_dim"]
|
||||
else:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
rope_freq = 10000
|
||||
if "rope_ratio" in self.hparams:
|
||||
|
||||
+1
-1
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
|
||||
factor = rope_params.get("factor", 8.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):
|
||||
|
||||
assert (hparams["activation_function"] == "silu")
|
||||
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
||||
rotary_factor = self.rope_parameters.get("partial_rotary_factor")
|
||||
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
|
||||
@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
|
||||
factor = rope_params.get("factor", 8.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
|
||||
factor = rope_params.get("factor", 16.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
+1
-1
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
|
||||
self.gguf_writer.add_head_count_kv(value_arr)
|
||||
|
||||
# handle n_rot differently for global vs swa layers
|
||||
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
|
||||
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
|
||||
self.gguf_writer.add_rope_dimension_count(n_rot_full)
|
||||
|
||||
+2
-2
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
|
||||
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
)
|
||||
self.gguf_writer.add_rope_dimension_count(
|
||||
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
|
||||
int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
|
||||
)
|
||||
|
||||
# MoE parameters - Use only routed expert count (shared experts handled separately)
|
||||
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
rope_dim = self.hparams["qk_rope_head_dim"]
|
||||
partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))
|
||||
|
||||
# NextN/MTP prediction layers
|
||||
|
||||
+1
-1
@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
|
||||
factor = rope_params.get("factor", 8.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
+1
-1
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
|
||||
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
|
||||
|
||||
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
|
||||
rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
|
||||
|
||||
+6
-10
@@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is not None:
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
long_factors = self.rope_parameters.get('long_factor')
|
||||
short_factors = self.rope_parameters.get('short_factor')
|
||||
if long_factors or short_factors:
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
@@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is not None:
|
||||
long_factors = self.rope_parameters.get('long_factor')
|
||||
short_factors = self.rope_parameters.get('short_factor')
|
||||
if long_factors or short_factors:
|
||||
rope_dims = self.hparams["qk_rope_head_dim"]
|
||||
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
|
||||
@@ -125,17 +125,18 @@ class NemotronModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
||||
|
||||
# * Partial RoPE
|
||||
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
|
||||
rot_pct = self.rope_parameters["partial_rotary_factor"]
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||
|
||||
# * RopeScaling for Nemotron
|
||||
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
|
||||
factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
|
||||
if factor is None:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
else:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
|
||||
self.gguf_writer.add_rope_scaling_factor(factor)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
|
||||
|
||||
+9
-11
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PHI2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
rot_pct = self.find_hparam(["partial_rotary_factor"])
|
||||
rot_pct = self.rope_parameters["partial_rotary_factor"]
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
|
||||
@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
|
||||
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
||||
rms_eps = self.find_hparam(["rms_norm_eps"])
|
||||
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
|
||||
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
rope_dims = int(rot_pct * n_embd) // n_head
|
||||
|
||||
self.gguf_writer.add_context_length(max_pos_embds)
|
||||
@@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
|
||||
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
rope_dims = int(rot_pct * n_embd) // n_head
|
||||
|
||||
# write rope scaling for long context (128k) model
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is None:
|
||||
long_factors = self.rope_parameters.get('long_factor')
|
||||
short_factors = self.rope_parameters.get('short_factor')
|
||||
if not long_factors:
|
||||
return
|
||||
|
||||
scale = max_pos_embds / orig_max_pos_embds
|
||||
|
||||
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
|
||||
rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
|
||||
if len(rope_scaling_type) == 0:
|
||||
raise KeyError('Missing the required key rope_scaling.type')
|
||||
|
||||
@@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
|
||||
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
|
||||
+1
-1
@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
|
||||
self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
|
||||
if (rope_dim := self.hparams.get("head_dim")) is None:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
|
||||
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
|
||||
rotary_factor = self.rope_parameters["partial_rotary_factor"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||
|
||||
+1
-1
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
|
||||
factor = float(rope_params.get("factor", 8.0))
|
||||
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
|
||||
high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
|
||||
old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
|
||||
old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
|
||||
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
|
||||
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
|
||||
|
||||
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
|
||||
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
|
||||
parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
|
||||
for (int idx = begin; idx < end; ++idx) {
|
||||
int batch_idx = idx / M;
|
||||
int m = idx % M;
|
||||
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
|
||||
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
|
||||
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
|
||||
|
||||
for (int m = 0; m < M; ++m) {
|
||||
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
|
||||
}
|
||||
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -3788,7 +3788,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
|
||||
ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
|
||||
}
|
||||
|
||||
static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) {
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
|
||||
#ifndef __EMSCRIPTEN__
|
||||
@@ -3800,17 +3800,20 @@ static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
options.nextInChain = &adapterTogglesDesc;
|
||||
#endif
|
||||
|
||||
ctx->webgpu_global_ctx->instance.WaitAny(
|
||||
ctx->webgpu_global_ctx->instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
ctx->webgpu_global_ctx->adapter = std::move(adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
instance.WaitAny(instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
adapter = std::move(_adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
}
|
||||
|
||||
static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter);
|
||||
GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr);
|
||||
|
||||
ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits);
|
||||
@@ -4543,20 +4546,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
|
||||
// Probe for adapter support
|
||||
wgpu::Adapter adapter;
|
||||
if (ctx->webgpu_global_ctx->instance != nullptr) {
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
|
||||
// probe for adapter support
|
||||
ctx->webgpu_global_ctx->instance.WaitAny(
|
||||
ctx->webgpu_global_ctx->instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
adapter = std::move(_adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter);
|
||||
}
|
||||
|
||||
// WebGPU backend requires f16 support and, on native, implicit device synchronization.
|
||||
|
||||
+7
-10
@@ -600,18 +600,15 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
|
||||
// convert fname (UTF-8)
|
||||
wchar_t * wfname = ggml_mbstowcs(fname);
|
||||
if (wfname) {
|
||||
// convert mode (ANSI)
|
||||
wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
|
||||
wchar_t * wmode_p = wmode;
|
||||
do {
|
||||
*wmode_p++ = (wchar_t)*mode;
|
||||
} while (*mode++);
|
||||
|
||||
// open file
|
||||
file = _wfopen(wfname, wmode);
|
||||
// convert mode (UTF-8)
|
||||
wchar_t * wmode = ggml_mbstowcs(mode);
|
||||
if (wmode) {
|
||||
// open file
|
||||
file = _wfopen(wfname, wmode);
|
||||
GGML_FREE(wmode);
|
||||
}
|
||||
|
||||
GGML_FREE(wfname);
|
||||
GGML_FREE(wmode);
|
||||
}
|
||||
|
||||
return file;
|
||||
|
||||
+1
-1
@@ -202,7 +202,7 @@ struct cli_context {
|
||||
|
||||
// TODO: support remote files in the future (http, https, etc)
|
||||
std::string load_input_file(const std::string & fname, bool is_media) {
|
||||
std::ifstream file(fname, std::ios::binary);
|
||||
std::ifstream file = fs_open_ifstream(fname, std::ios::binary);
|
||||
if (!file) {
|
||||
return "";
|
||||
}
|
||||
|
||||
@@ -13,6 +13,14 @@
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <fstream>
|
||||
|
||||
#ifdef _WIN32
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
@@ -661,6 +669,22 @@ struct clip_image_f32_batch {
|
||||
// common utils
|
||||
//
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::ifstream open_ifstream_binary(const std::string & fname) {
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
|
||||
if (!wlen) {
|
||||
throw std::runtime_error("failed to convert filename to UTF-16: " + fname);
|
||||
}
|
||||
std::vector<wchar_t> wfname(wlen);
|
||||
(void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
|
||||
return std::ifstream(wfname.data(), std::ios::binary);
|
||||
}
|
||||
#else
|
||||
static std::ifstream open_ifstream_binary(const std::string & fname) {
|
||||
return std::ifstream(fname, std::ios::binary);
|
||||
}
|
||||
#endif
|
||||
|
||||
static std::string string_format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
|
||||
+1
-1
@@ -1752,7 +1752,7 @@ struct clip_model_loader {
|
||||
std::map<std::string, size_t> tensor_offset;
|
||||
std::vector<ggml_tensor *> tensors_to_load;
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
auto fin = open_ifstream_binary(fname);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
@@ -396,6 +396,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
// Ctrl+C handling
|
||||
{
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
|
||||
@@ -582,13 +582,29 @@ mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx,
|
||||
}
|
||||
|
||||
mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
|
||||
std::vector<unsigned char> buf;
|
||||
#ifdef _WIN32
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
|
||||
if (!wlen) {
|
||||
LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
std::vector<wchar_t> wfname(wlen);
|
||||
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wfname.data(), wlen);
|
||||
if (!wlen) {
|
||||
LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
FILE * f = _wfopen(wfname.data(), L"rb");
|
||||
#else
|
||||
FILE * f = fopen(fname, "rb");
|
||||
#endif
|
||||
if (!f) {
|
||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
|
||||
std::vector<unsigned char> buf;
|
||||
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
|
||||
@@ -180,6 +180,17 @@ That requires `JSON.stringify` when formatted to message content:
|
||||
}
|
||||
```
|
||||
|
||||
### Router mode: how child <--> router communicates
|
||||
|
||||
Upon spawning a new child process using `subprocess`, both child and router listen to the stdout/stderr (combined)
|
||||
|
||||
For the direction from child to router:
|
||||
- Generic messages are logs, it will be forwarded to router's stdout
|
||||
- Special state update messages are prefixed by `cmd_child_to_router:state:`, followed by a JSON. See `server_models::handle_child_state` for more
|
||||
|
||||
For the direction from router to child:
|
||||
- When server sends `cmd_router_to_child:exit`, the child should exit gracefully --> if after `DEFAULT_STOP_TIMEOUT` and the child is still running, force-kill it
|
||||
|
||||
### Model management API (router mode)
|
||||
|
||||
Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
|
||||
|
||||
@@ -198,7 +198,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
||||
| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
|
||||
| `--api-key-file FNAME` | path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
|
||||
json format_error_response(const std::string & message, const enum error_type type) {
|
||||
std::string type_str;
|
||||
@@ -1238,7 +1239,7 @@ json format_response_rerank(
|
||||
// other utils
|
||||
//
|
||||
|
||||
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
|
||||
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx, size_t n_top) {
|
||||
std::vector<llama_token_data> cur;
|
||||
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
@@ -1257,21 +1258,34 @@ std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int i
|
||||
}
|
||||
}
|
||||
|
||||
// sort tokens by logits
|
||||
std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
// sort tokens by logits (partial: only the leading `n_top` need ordering)
|
||||
if (n_top > cur.size()) {
|
||||
n_top = cur.size();
|
||||
}
|
||||
if (n_top > 0) {
|
||||
std::partial_sort(cur.begin(), cur.begin() + n_top, cur.end(),
|
||||
[](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
}
|
||||
|
||||
// apply softmax
|
||||
float max_l = cur[0].logit;
|
||||
float max_l = -std::numeric_limits<float>::infinity();
|
||||
if (n_top > 0) {
|
||||
max_l = cur[0].logit; // partial_sort guarantees the absolute maximum is at index 0
|
||||
} else {
|
||||
for (const auto & t : cur) {
|
||||
max_l = std::max(max_l, t.logit);
|
||||
}
|
||||
}
|
||||
float cum_sum = 0.0f;
|
||||
for (size_t i = 0; i < cur.size(); ++i) {
|
||||
float p = expf(cur[i].logit - max_l);
|
||||
cur[i].p = p;
|
||||
for (auto & t : cur) {
|
||||
float p = expf(t.logit - max_l);
|
||||
t.p = p;
|
||||
cum_sum += p;
|
||||
}
|
||||
for (size_t i = 0; i < cur.size(); ++i) {
|
||||
cur[i].p /= cum_sum;
|
||||
for (auto & t : cur) {
|
||||
t.p /= cum_sum;
|
||||
}
|
||||
|
||||
return cur;
|
||||
|
||||
@@ -326,7 +326,7 @@ json format_response_rerank(
|
||||
// other utils
|
||||
//
|
||||
|
||||
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
|
||||
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx, size_t n_top);
|
||||
|
||||
std::string safe_json_to_str(const json & data);
|
||||
|
||||
|
||||
@@ -63,11 +63,6 @@ enum slot_state {
|
||||
SLOT_STATE_GENERATING,
|
||||
};
|
||||
|
||||
enum server_state {
|
||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
};
|
||||
|
||||
struct server_slot {
|
||||
int id;
|
||||
|
||||
@@ -773,6 +768,8 @@ public:
|
||||
// note: chat_params must not be refreshed upon existing sleeping state
|
||||
server_chat_params chat_params;
|
||||
|
||||
server_state_callback_t callback_state = [](server_state, json) -> void {};
|
||||
|
||||
server_context_impl() {
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
}
|
||||
@@ -825,8 +822,7 @@ private:
|
||||
|
||||
server_metrics metrics;
|
||||
|
||||
json json_ui_settings = json::object(); // Primary: new name
|
||||
json json_webui_settings = json::object(); // Deprecated: use json_ui_settings instead (kept for compat)
|
||||
json json_ui_settings = json::object();
|
||||
|
||||
// Necessary similarity of prompt for slot selection
|
||||
float slot_prompt_similarity = 0.0f;
|
||||
@@ -1245,8 +1241,8 @@ private:
|
||||
if (!params_base.model_alias.empty()) {
|
||||
// backward compat: use first alias as model name
|
||||
model_name = *params_base.model_alias.begin();
|
||||
} else if (!params_base.model.name.empty()) {
|
||||
model_name = params_base.model.name;
|
||||
} else if (!params_base.model.get_name().empty()) {
|
||||
model_name = params_base.model.get_name();
|
||||
} else {
|
||||
// fallback: derive model name from file name
|
||||
auto model_path = std::filesystem::path(params_base.model.path);
|
||||
@@ -1308,7 +1304,6 @@ private:
|
||||
try {
|
||||
json json_settings = json::parse(cfg);
|
||||
json_ui_settings = json_settings;
|
||||
json_webui_settings = json_settings; // deprecated: keep in sync
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("%s: failed to parse UI config: %s\n", __func__, e.what());
|
||||
return false;
|
||||
@@ -1826,8 +1821,7 @@ private:
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// TODO: optimize this with min-p optimization
|
||||
std::vector<llama_token_data> cur = get_token_probabilities(ctx_tgt, idx);
|
||||
std::vector<llama_token_data> cur = get_token_probabilities(ctx_tgt, idx, n_probs_request);
|
||||
const size_t max_probs = cur.size();
|
||||
const size_t n_probs = std::min(max_probs, n_probs_request);
|
||||
|
||||
@@ -3687,7 +3681,6 @@ server_context_meta server_context::get_meta() const {
|
||||
/* has_inp_audio */ impl->chat_params.allow_audio,
|
||||
/* has_inp_video */ impl->chat_params.allow_video,
|
||||
/* json_ui_settings */ impl->json_ui_settings,
|
||||
/* json_webui_settings */ impl->json_webui_settings, // Deprecated
|
||||
/* slot_n_ctx */ impl->get_slot_n_ctx(),
|
||||
/* pooling_type */ llama_pooling_type(impl->ctx_tgt),
|
||||
|
||||
@@ -3738,8 +3731,11 @@ struct server_res_generator : server_http_res {
|
||||
}
|
||||
};
|
||||
|
||||
void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
|
||||
impl->queue_tasks.on_sleeping_state(std::move(callback));
|
||||
void server_context::set_state_callback(server_state_callback_t callback) {
|
||||
impl->callback_state = std::move(callback);
|
||||
impl->queue_tasks.on_sleeping_state([this](bool sleeping) {
|
||||
impl->callback_state(sleeping ? SERVER_STATE_SLEEPING : SERVER_STATE_READY, {});
|
||||
});
|
||||
}
|
||||
|
||||
// compute the number of tokens before the last user message in the prompt
|
||||
@@ -4300,12 +4296,8 @@ void server_routes::init_routes() {
|
||||
{ "endpoint_slots", params.endpoint_slots },
|
||||
{ "endpoint_props", params.endpoint_props },
|
||||
{ "endpoint_metrics", params.endpoint_metrics },
|
||||
// New keys
|
||||
{ "ui", params.ui },
|
||||
{ "ui_settings", meta->json_ui_settings },
|
||||
// Deprecated: use ui/ui_settings instead (kept for backward compat)
|
||||
{ "webui", params.ui },
|
||||
{ "webui_settings", meta->json_ui_settings },
|
||||
{ "chat_template", tmpl_default },
|
||||
{ "chat_template_caps", meta->chat_template_caps },
|
||||
{ "bos_token", meta->bos_token_str },
|
||||
|
||||
@@ -22,8 +22,7 @@ struct server_context_meta {
|
||||
bool has_inp_image;
|
||||
bool has_inp_audio;
|
||||
bool has_inp_video;
|
||||
json json_ui_settings; // Primary: new name
|
||||
json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat)
|
||||
json json_ui_settings;
|
||||
int slot_n_ctx;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
@@ -53,6 +52,31 @@ struct server_context_meta {
|
||||
uint64_t model_size;
|
||||
};
|
||||
|
||||
enum server_state {
|
||||
// SERVER_STATE_DOWNLOADING,
|
||||
SERVER_STATE_LOADING,
|
||||
SERVER_STATE_READY,
|
||||
SERVER_STATE_SLEEPING,
|
||||
};
|
||||
|
||||
static std::string server_state_to_str(server_state state) {
|
||||
switch (state) {
|
||||
case SERVER_STATE_LOADING: return "loading";
|
||||
case SERVER_STATE_READY: return "ready";
|
||||
case SERVER_STATE_SLEEPING: return "sleeping";
|
||||
default: GGML_ASSERT(false && "invalid server_state");
|
||||
}
|
||||
}
|
||||
|
||||
static server_state server_state_from_str(const std::string & str) {
|
||||
if (str == "loading") return SERVER_STATE_LOADING;
|
||||
if (str == "ready") return SERVER_STATE_READY;
|
||||
if (str == "sleeping") return SERVER_STATE_SLEEPING;
|
||||
GGML_ASSERT(false && "invalid server_state string");
|
||||
}
|
||||
|
||||
using server_state_callback_t = std::function<void(server_state, json /* payload */)>;
|
||||
|
||||
struct server_context {
|
||||
std::unique_ptr<server_context_impl> impl;
|
||||
|
||||
@@ -80,9 +104,8 @@ struct server_context {
|
||||
// not thread-safe, should only be used from the main thread
|
||||
server_context_meta get_meta() const;
|
||||
|
||||
// register a callback to be called when sleeping state changes
|
||||
// must be set before load_model() is called
|
||||
void on_sleeping_changed(std::function<void(bool)> callback);
|
||||
// note: must be set before load_model() is called
|
||||
void set_state_callback(server_state_callback_t callback);
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "server-common.h"
|
||||
#include "server-models.h"
|
||||
#include "server-context.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "preset.h"
|
||||
@@ -44,9 +45,7 @@ extern char **environ;
|
||||
#define DEFAULT_STOP_TIMEOUT 10 // seconds
|
||||
|
||||
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
|
||||
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
|
||||
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
|
||||
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string
|
||||
#define CMD_CHILD_TO_ROUTER_STATE "cmd_child_to_router:state:" // followed by json string
|
||||
|
||||
// address for child process, this is needed because router may run on 0.0.0.0
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
|
||||
@@ -904,12 +903,8 @@ void server_models::load(const std::string & name) {
|
||||
while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
|
||||
LOG("[%5d] %s", port, buffer);
|
||||
std::string str(buffer);
|
||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
|
||||
this->update_loaded_info(name, str);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
|
||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STATE)) {
|
||||
this->handle_child_state(name, str);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -976,7 +971,10 @@ void server_models::load(const std::string & name) {
|
||||
subprocess_destroy(&child_proc->get());
|
||||
|
||||
// update status and exit code
|
||||
this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
|
||||
this->update_status(name, {
|
||||
SERVER_MODEL_STATUS_UNLOADED,
|
||||
exit_code
|
||||
});
|
||||
SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
|
||||
});
|
||||
|
||||
@@ -1016,7 +1014,8 @@ struct server_models_download_res : public common_download_callback {
|
||||
common_download_model(model, opts);
|
||||
is_ok = true;
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
|
||||
auto model_name = model.get_name();
|
||||
SRV_ERR("download failed for model name=%s: %s\n", model_name.c_str(), e.what());
|
||||
is_ok = false;
|
||||
}
|
||||
return is_ok;
|
||||
@@ -1036,7 +1035,7 @@ struct server_models_download_res : public common_download_callback {
|
||||
};
|
||||
|
||||
void server_models::download(common_params_model && model, common_download_opts && opts) {
|
||||
std::string name = model.name;
|
||||
std::string name = model.get_name();
|
||||
GGML_ASSERT(name == model.hf_repo);
|
||||
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
@@ -1064,9 +1063,10 @@ void server_models::download(common_params_model && model, common_download_opts
|
||||
inst.th = std::thread([this, dl = std::move(dl)]() {
|
||||
dl->opts.callback = dl.get();
|
||||
bool ok = dl->run();
|
||||
auto model_name = dl->model.get_name();
|
||||
SRV_INF("download finished for model name=%s with status=%s\n",
|
||||
dl->model.name.c_str(), ok ? "success" : "failure");
|
||||
update_download_progress(dl->model.name, {}, true, ok);
|
||||
model_name.c_str(), ok ? "success" : "failure");
|
||||
update_download_progress(model_name, {}, true, ok);
|
||||
// need_reload is set inside update_download_progress under the mutex;
|
||||
// the next load_models() call will clean up this instance
|
||||
});
|
||||
@@ -1130,21 +1130,27 @@ void server_models::unload_all() {
|
||||
}
|
||||
}
|
||||
|
||||
void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
|
||||
void server_models::update_status(const std::string & name, const update_status_args & args) {
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
auto it = mapping.find(name);
|
||||
if (it != mapping.end()) {
|
||||
auto & meta = it->second.meta;
|
||||
meta.status = status;
|
||||
meta.exit_code = exit_code;
|
||||
meta.status = args.status;
|
||||
meta.exit_code = args.exit_code;
|
||||
if (!args.loaded_info.is_null()) {
|
||||
meta.loaded_info = args.loaded_info;
|
||||
}
|
||||
}
|
||||
// broadcast status change to SSE
|
||||
{
|
||||
json data = {
|
||||
{"status", server_model_status_to_string(status)},
|
||||
{"status", server_model_status_to_string(args.status)},
|
||||
};
|
||||
if (status == SERVER_MODEL_STATUS_UNLOADED) {
|
||||
data["exit_code"] = exit_code;
|
||||
if (args.status == SERVER_MODEL_STATUS_UNLOADED) {
|
||||
data["exit_code"] = args.exit_code;
|
||||
}
|
||||
if (!args.loaded_info.is_null()) {
|
||||
data["info"] = args.loaded_info;
|
||||
}
|
||||
// note: notify_sse doesn't acquire the lock, so no deadlock here
|
||||
notify_sse("status_change", name, data);
|
||||
@@ -1152,29 +1158,6 @@ void server_models::update_status(const std::string & name, server_model_status
|
||||
cv.notify_all();
|
||||
}
|
||||
|
||||
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
|
||||
if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
|
||||
SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
json info;
|
||||
try {
|
||||
info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
|
||||
return;
|
||||
}
|
||||
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
auto it = mapping.find(name);
|
||||
if (it != mapping.end()) {
|
||||
auto & meta = it->second.meta;
|
||||
meta.loaded_info = info;
|
||||
}
|
||||
cv.notify_all();
|
||||
}
|
||||
|
||||
void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
|
||||
json curr;
|
||||
{
|
||||
@@ -1323,21 +1306,54 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
|
||||
return proxy;
|
||||
}
|
||||
|
||||
bool server_models::is_child_server() {
|
||||
void server_models::handle_child_state(const std::string & name, const std::string & raw_input) {
|
||||
server_state state;
|
||||
json payload;
|
||||
|
||||
try {
|
||||
json data = json::parse(raw_input.substr(strlen(CMD_CHILD_TO_ROUTER_STATE)));
|
||||
state = server_state_from_str(json_value(data, "state", std::string()));
|
||||
payload = json_value(data, "payload", json{});
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("failed to parse child state update for name=%s: %s\n", name.c_str(), e.what());
|
||||
return;
|
||||
}
|
||||
|
||||
switch (state) {
|
||||
case SERVER_STATE_LOADING:
|
||||
{
|
||||
// do nothing for now
|
||||
// TODO: report loading progress for first load and wakeup from sleep
|
||||
} break;
|
||||
case SERVER_STATE_READY:
|
||||
{
|
||||
update_status(name, {
|
||||
SERVER_MODEL_STATUS_LOADED,
|
||||
0,
|
||||
// note: payload can be empty if this is a wakeup from sleep
|
||||
payload.size() > 0 ? payload : nullptr
|
||||
});
|
||||
} break;
|
||||
case SERVER_STATE_SLEEPING:
|
||||
{
|
||||
update_status(name, { SERVER_MODEL_STATUS_SLEEPING });
|
||||
} break;
|
||||
default:
|
||||
// should never happen, but just in case
|
||||
GGML_ASSERT(false && "unexpected state from child server");
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// server_child
|
||||
//
|
||||
|
||||
bool server_child::is_child() {
|
||||
const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
|
||||
return router_port != nullptr;
|
||||
}
|
||||
|
||||
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
|
||||
// send a notification to the router server that a model instance is ready
|
||||
common_log_pause(common_log_main());
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
|
||||
fflush(stdout);
|
||||
common_log_resume(common_log_main());
|
||||
|
||||
std::thread server_child::setup(const std::function<void(int)> & shutdown_handler) {
|
||||
// setup thread for monitoring stdin
|
||||
return std::thread([shutdown_handler]() {
|
||||
// wait for EOF on stdin
|
||||
@@ -1363,10 +1379,14 @@ std::thread server_models::setup_child_server(const std::function<void(int)> & s
|
||||
});
|
||||
}
|
||||
|
||||
void server_models::notify_router_sleeping_state(bool is_sleeping) {
|
||||
void server_child::notify_to_router(const std::string & state, const json & payload) {
|
||||
json data = {
|
||||
{"state", state},
|
||||
{"payload", payload},
|
||||
};
|
||||
common_log_pause(common_log_main());
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY);
|
||||
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_STATE, safe_json_to_str(data).c_str());
|
||||
fflush(stdout);
|
||||
common_log_resume(common_log_main());
|
||||
}
|
||||
@@ -1474,7 +1494,6 @@ void server_models_routes::init_routes() {
|
||||
}},
|
||||
// New key
|
||||
{"ui_settings", ui_settings},
|
||||
{"webui_settings", webui_settings},
|
||||
{"build_info", std::string(llama_build_info())},
|
||||
{"cors_proxy_enabled", params.ui_mcp_proxy},
|
||||
});
|
||||
@@ -1645,7 +1664,6 @@ void server_models_routes::init_routes() {
|
||||
common_params_model model;
|
||||
common_download_opts opts;
|
||||
|
||||
model.name = name;
|
||||
model.hf_repo = name;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.download_mmproj = true;
|
||||
|
||||
@@ -171,8 +171,12 @@ public:
|
||||
void download(common_params_model && model, common_download_opts && opts);
|
||||
|
||||
// update the status of a model instance (thread-safe)
|
||||
void update_status(const std::string & name, server_model_status status, int exit_code);
|
||||
void update_loaded_info(const std::string & name, std::string & raw_info);
|
||||
struct update_status_args {
|
||||
server_model_status status;
|
||||
int exit_code = 0; // only valid if status == UNLOADED
|
||||
json loaded_info = nullptr;
|
||||
};
|
||||
void update_status(const std::string & name, const update_status_args & args);
|
||||
void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);
|
||||
|
||||
// remove a cache model from disk and update the list (thread-safe)
|
||||
@@ -193,21 +197,32 @@ public:
|
||||
// proxy an HTTP request to the model instance
|
||||
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
|
||||
|
||||
// handle message sent from server_child::notify_to_router()
|
||||
// raw input must starts with CMD_CHILD_TO_ROUTER_STATE, followed by a JSON string
|
||||
// this function is not thread-safe, must be called from instance's monitoring thread
|
||||
// payload per state:
|
||||
// state = loading -> payload = {} (TODO: add progress info)
|
||||
// state = ready -> payload = model_info (json), or {} if wakeup from sleeping
|
||||
// state = sleeping -> payload = {}
|
||||
void handle_child_state(const std::string & name, const std::string & raw_input);
|
||||
};
|
||||
|
||||
struct server_child {
|
||||
// return true if the current process is a child server instance
|
||||
static bool is_child_server();
|
||||
bool is_child();
|
||||
|
||||
// notify the router server that a model instance is ready
|
||||
// register the shutdown_handler to be called by the router
|
||||
// return the monitoring thread (to be joined by the caller)
|
||||
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
|
||||
std::thread setup(const std::function<void(int)> & shutdown_handler);
|
||||
|
||||
// notify the router server that the sleeping state has changed
|
||||
static void notify_router_sleeping_state(bool sleeping);
|
||||
// notify router server for status changes (e.g. loading, downloading, sleeping, etc.)
|
||||
// message will be handled by server_models::handle_child_state() on the router side
|
||||
void notify_to_router(const std::string & state_name, const json & payload);
|
||||
};
|
||||
|
||||
struct server_models_routes {
|
||||
common_params params;
|
||||
json ui_settings = json::object(); // Primary: new name
|
||||
json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat)
|
||||
std::atomic<bool> stopping = false; // for graceful disconnecting SSE clients during shutdown
|
||||
server_models models;
|
||||
server_models_routes(const common_params & params, int argc, char ** argv)
|
||||
@@ -217,7 +232,6 @@ struct server_models_routes {
|
||||
try {
|
||||
json json_settings = json::parse(cfg);
|
||||
ui_settings = json_settings;
|
||||
webui_settings = json_settings; // Deprecated: keep in sync
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what());
|
||||
throw;
|
||||
|
||||
+17
-12
@@ -90,8 +90,10 @@ int llama_server(int argc, char ** argv) {
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// router server never loads a model and must not touch the GPU
|
||||
const bool is_router_server = params.model.path.empty()
|
||||
&& params.model.hf_repo.empty();
|
||||
|
||||
// skip device enumeration so the CUDA primary context stays uncreated
|
||||
const bool is_router_server = params.model.path.empty();
|
||||
common_params_print_info(params, !is_router_server);
|
||||
|
||||
if (!is_router_server) {
|
||||
@@ -113,8 +115,9 @@ int llama_server(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// for consistency between server router mode and single-model mode, we set the same model name as alias
|
||||
if (params.model_alias.empty() && !params.model.name.empty()) {
|
||||
params.model_alias.insert(params.model.name);
|
||||
auto model_name = params.model.get_name();
|
||||
if (params.model_alias.empty() && !model_name.empty()) {
|
||||
params.model_alias.insert(model_name);
|
||||
}
|
||||
|
||||
// struct that contains llama context and inference
|
||||
@@ -255,6 +258,7 @@ int llama_server(int argc, char ** argv) {
|
||||
// Start the server
|
||||
//
|
||||
|
||||
server_child child; // only used in non-router mode
|
||||
std::function<void()> clean_up;
|
||||
|
||||
if (is_router_server) {
|
||||
@@ -300,15 +304,16 @@ int llama_server(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load the model
|
||||
SRV_INF("%s", "loading model\n");
|
||||
|
||||
if (server_models::is_child_server()) {
|
||||
ctx_server.on_sleeping_changed([&](bool sleeping) {
|
||||
server_models::notify_router_sleeping_state(sleeping);
|
||||
// setup communication child --> router if necessary
|
||||
if (child.is_child()) {
|
||||
ctx_server.set_state_callback([&](server_state state, json payload) {
|
||||
child.notify_to_router(server_state_to_str(state), payload);
|
||||
});
|
||||
}
|
||||
|
||||
// load the model
|
||||
SRV_INF("%s", "loading model\n");
|
||||
|
||||
if (!ctx_server.load_model(params)) {
|
||||
clean_up();
|
||||
if (ctx_http.thread.joinable()) {
|
||||
@@ -365,9 +370,9 @@ int llama_server(int argc, char ** argv) {
|
||||
|
||||
// optionally, notify router server that this instance is ready
|
||||
std::thread monitor_thread;
|
||||
if (server_models::is_child_server()) {
|
||||
json model_info = routes.get_model_info();
|
||||
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
|
||||
if (child.is_child()) {
|
||||
monitor_thread = child.setup(shutdown_handler);
|
||||
child.notify_to_router(server_state_to_str(SERVER_STATE_READY), routes.get_model_info());
|
||||
}
|
||||
|
||||
// this call blocks the main thread until queue_tasks.terminate() is called
|
||||
|
||||
@@ -79,9 +79,9 @@ def test_load_split_model():
|
||||
assert match_regex("(little|girl)+", res.body["content"])
|
||||
|
||||
|
||||
def test_no_webui():
|
||||
def test_no_ui():
|
||||
global server
|
||||
# default: webui enabled
|
||||
# default: UI enabled
|
||||
server.start()
|
||||
url = f"http://{server.server_host}:{server.server_port}"
|
||||
res = requests.get(url)
|
||||
@@ -89,8 +89,8 @@ def test_no_webui():
|
||||
assert "<!doctype html>" in res.text
|
||||
server.stop()
|
||||
|
||||
# with --no-webui
|
||||
server.no_webui = True
|
||||
# with --no-ui, the UI should be disabled
|
||||
server.no_ui = True
|
||||
server.start()
|
||||
res = requests.get(url)
|
||||
assert res.status_code == 404
|
||||
|
||||
@@ -12,7 +12,7 @@ def create_server():
|
||||
|
||||
def test_mcp_no_proxy():
|
||||
global server
|
||||
server.webui_mcp_proxy = False
|
||||
server.ui_mcp_proxy = False
|
||||
server.start()
|
||||
|
||||
res = server.make_request("GET", "/cors-proxy")
|
||||
@@ -21,7 +21,7 @@ def test_mcp_no_proxy():
|
||||
|
||||
def test_mcp_proxy():
|
||||
global server
|
||||
server.webui_mcp_proxy = True
|
||||
server.ui_mcp_proxy = True
|
||||
server.start()
|
||||
|
||||
url = f"http://{server.server_host}:{server.server_port}/cors-proxy?url=http://example.com"
|
||||
@@ -32,7 +32,7 @@ def test_mcp_proxy():
|
||||
|
||||
def test_mcp_proxy_custom_port():
|
||||
global server
|
||||
server.webui_mcp_proxy = True
|
||||
server.ui_mcp_proxy = True
|
||||
server.start()
|
||||
|
||||
# try getting the server's models API via the proxy
|
||||
|
||||
@@ -94,7 +94,7 @@ class ServerProcess:
|
||||
enable_ctx_shift: int | None = False
|
||||
spec_draft_n_min: int | None = None
|
||||
spec_draft_n_max: int | None = None
|
||||
no_webui: bool | None = None
|
||||
no_ui: bool | None = None
|
||||
jinja: bool | None = None
|
||||
reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
|
||||
reasoning: Literal['on', 'off', 'auto'] | None = None
|
||||
@@ -107,7 +107,7 @@ class ServerProcess:
|
||||
cache_ram: int | None = None
|
||||
no_cache_idle_slots: bool = False
|
||||
log_path: str | None = None
|
||||
webui_mcp_proxy: bool = False
|
||||
ui_mcp_proxy: bool = False
|
||||
backend_sampling: bool = False
|
||||
gcp_compat: bool = False
|
||||
|
||||
@@ -225,8 +225,8 @@ class ServerProcess:
|
||||
server_args.extend(["--spec-draft-n-max", self.spec_draft_n_max])
|
||||
if self.spec_draft_n_min:
|
||||
server_args.extend(["--spec-draft-n-min", self.spec_draft_n_min])
|
||||
if self.no_webui:
|
||||
server_args.append("--no-webui")
|
||||
if self.no_ui:
|
||||
server_args.append("--no-ui")
|
||||
if self.no_models_autoload:
|
||||
server_args.append("--no-models-autoload")
|
||||
if self.jinja:
|
||||
@@ -251,8 +251,8 @@ class ServerProcess:
|
||||
server_args.extend(["--cache-ram", self.cache_ram])
|
||||
if self.no_cache_idle_slots:
|
||||
server_args.append("--no-cache-idle-slots")
|
||||
if self.webui_mcp_proxy:
|
||||
server_args.append("--webui-mcp-proxy")
|
||||
if self.ui_mcp_proxy:
|
||||
server_args.append("--ui-mcp-proxy")
|
||||
if self.backend_sampling:
|
||||
server_args.append("--backend_sampling")
|
||||
if self.gcp_compat:
|
||||
|
||||
Reference in New Issue
Block a user