Compare commits

...

10 Commits

Author SHA1 Message Date
xiaobing318 e251e5ebbe cmake : add utf8 compilation options for msvc (#17682) 2025-12-02 19:50:57 +02:00
Chad Voegele c4357dcc35 Server: Change Invalid Schema from Server Error (500) to User Error (400) (#17572)
* Make invalid schema a user error (400)

* Move invalid_argument exception handler to ex_wrapper

* Fix test

* Simplify test back to original pattern
2025-12-02 17:33:50 +01:00
Adrien Gallouët e148380c7c ggml : use svcntb() for SVE vector length detection (#17474)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-12-02 18:21:11 +02:00
TianHao324 a2b0fe8d37 CANN: Disable Ger operator of OUT_PROD on 310p device (#17563) 2025-12-02 20:35:23 +08:00
Daniel Bevenius 7f3a72a8ed ggml : remove redundant n_copies check when setting input/output (#17612)
This commit removes a redundant check for sched->n_copies > 1 when
setting input and output flags on tensor copies in
ggml_backend_sched_split_graph.

The motivation for this change is to clarify the code as the outer if
statement already performs this check.
2025-12-02 12:52:45 +01:00
Eric Curtin b9a37717b0 codeowners : remove ericcurtin (#17658)
Taking a break from llama.cpp . I wasn't around at the start of llama.cpp
but I want to thank @ggerganov and @slaren for creating a neat community
here.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
2025-12-02 12:18:15 +01:00
Adrien Gallouët f3a9674ae8 llama : fix signed comparison warning on FreeBSD (#17497)
This ensures correct RLIM_INFINITY handling and compatibility on all platforms (32/64-bit).

    warning: comparison of integers of different signs: 'rlim_t' (aka 'long') and 'size_t' (aka 'unsigned long') [-Wsign-compare]
      488 |         if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
          |                         ~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-12-02 12:05:38 +01:00
Xuan-Son Nguyen 2c453c6c77 convert: add error message for mistral3 quantized weight (#17686) 2025-12-02 11:48:31 +01:00
Xuan-Son Nguyen 5d6bd842ea server: remove default "gpt-3.5-turbo" model name (#17668)
* server: remove default "gpt-3.5-turbo" model name

* do not reflect back model name from request

* fix test
2025-12-02 11:38:57 +01:00
senhtry fd3abe849e server: fixing naming conflict res_error in server-models.cpp (#17679) 2025-12-02 11:18:39 +01:00
17 changed files with 158 additions and 128 deletions
+2 -3
View File
@@ -7,7 +7,7 @@
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
/common/arg.* @ggerganov @ericcurtin
/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/common.* @ggerganov
@@ -87,8 +87,7 @@
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/run/ @ericcurtin
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
/tools/server/* @ngxson @ggerganov # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov
+16 -16
View File
@@ -163,7 +163,7 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
if (tool_choice == "required") {
return COMMON_CHAT_TOOL_CHOICE_REQUIRED;
}
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
throw std::invalid_argument("Invalid tool_choice: " + tool_choice);
}
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
@@ -186,17 +186,17 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
try {
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump());
throw std::invalid_argument("Expected 'messages' to be an array, got " + messages.dump());
}
for (const auto & message : messages) {
if (!message.is_object()) {
throw std::runtime_error("Expected 'message' to be an object, got " + message.dump());
throw std::invalid_argument("Expected 'message' to be an object, got " + message.dump());
}
common_chat_msg msg;
if (!message.contains("role")) {
throw std::runtime_error("Missing 'role' in message: " + message.dump());
throw std::invalid_argument("Missing 'role' in message: " + message.dump());
}
msg.role = message.at("role");
@@ -209,11 +209,11 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
} else if (content.is_array()) {
for (const auto & part : content) {
if (!part.contains("type")) {
throw std::runtime_error("Missing content part type: " + part.dump());
throw std::invalid_argument("Missing content part type: " + part.dump());
}
const auto & type = part.at("type");
if (type != "text") {
throw std::runtime_error("Unsupported content part type: " + type.dump());
throw std::invalid_argument("Unsupported content part type: " + type.dump());
}
common_chat_msg_content_part msg_part;
msg_part.type = type;
@@ -221,25 +221,25 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
msg.content_parts.push_back(msg_part);
}
} else if (!content.is_null()) {
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
throw std::invalid_argument("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
}
if (has_tool_calls) {
for (const auto & tool_call : message.at("tool_calls")) {
common_chat_tool_call tc;
if (!tool_call.contains("type")) {
throw std::runtime_error("Missing tool call type: " + tool_call.dump());
throw std::invalid_argument("Missing tool call type: " + tool_call.dump());
}
const auto & type = tool_call.at("type");
if (type != "function") {
throw std::runtime_error("Unsupported tool call type: " + tool_call.dump());
throw std::invalid_argument("Unsupported tool call type: " + tool_call.dump());
}
if (!tool_call.contains("function")) {
throw std::runtime_error("Missing tool call function: " + tool_call.dump());
throw std::invalid_argument("Missing tool call function: " + tool_call.dump());
}
const auto & fc = tool_call.at("function");
if (!fc.contains("name")) {
throw std::runtime_error("Missing tool call name: " + tool_call.dump());
throw std::invalid_argument("Missing tool call name: " + tool_call.dump());
}
tc.name = fc.at("name");
tc.arguments = fc.at("arguments");
@@ -250,7 +250,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
}
}
if (!has_content && !has_tool_calls) {
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
throw std::invalid_argument("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
}
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
@@ -353,18 +353,18 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
try {
if (!tools.is_null()) {
if (!tools.is_array()) {
throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump());
throw std::invalid_argument("Expected 'tools' to be an array, got " + tools.dump());
}
for (const auto & tool : tools) {
if (!tool.contains("type")) {
throw std::runtime_error("Missing tool type: " + tool.dump());
throw std::invalid_argument("Missing tool type: " + tool.dump());
}
const auto & type = tool.at("type");
if (!type.is_string() || type != "function") {
throw std::runtime_error("Unsupported tool type: " + tool.dump());
throw std::invalid_argument("Unsupported tool type: " + tool.dump());
}
if (!tool.contains("function")) {
throw std::runtime_error("Missing tool function: " + tool.dump());
throw std::invalid_argument("Missing tool function: " + tool.dump());
}
const auto & function = tool.at("function");
+1 -1
View File
@@ -974,7 +974,7 @@ public:
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
+4
View File
@@ -2842,6 +2842,10 @@ class Mistral3Model(LlamaModel):
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
# TODO: probably not worth supporting quantized weight, as official BF16 is also available
if name.endswith("weight_scale_inv"):
raise ValueError("This is a quantized weight, please use BF16 weight instead")
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "vision_tower" in name:
return []
+47 -42
View File
@@ -408,62 +408,67 @@ if (MSVC)
/wd4996 # Disable POSIX deprecation warnings
/wd4702 # Unreachable code warnings
)
function(disable_msvc_warnings target_name)
set(MSVC_COMPILE_OPTIONS
"$<$<COMPILE_LANGUAGE:C>:/utf-8>"
"$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
)
function(configure_msvc_target target_name)
if(TARGET ${target_name})
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
endif()
endfunction()
disable_msvc_warnings(ggml-base)
disable_msvc_warnings(ggml)
disable_msvc_warnings(ggml-cpu)
disable_msvc_warnings(ggml-cpu-x64)
disable_msvc_warnings(ggml-cpu-sse42)
disable_msvc_warnings(ggml-cpu-sandybridge)
disable_msvc_warnings(ggml-cpu-haswell)
disable_msvc_warnings(ggml-cpu-skylakex)
disable_msvc_warnings(ggml-cpu-icelake)
disable_msvc_warnings(ggml-cpu-alderlake)
configure_msvc_target(ggml-base)
configure_msvc_target(ggml)
configure_msvc_target(ggml-cpu)
configure_msvc_target(ggml-cpu-x64)
configure_msvc_target(ggml-cpu-sse42)
configure_msvc_target(ggml-cpu-sandybridge)
configure_msvc_target(ggml-cpu-haswell)
configure_msvc_target(ggml-cpu-skylakex)
configure_msvc_target(ggml-cpu-icelake)
configure_msvc_target(ggml-cpu-alderlake)
if (GGML_BUILD_EXAMPLES)
disable_msvc_warnings(common-ggml)
disable_msvc_warnings(common)
configure_msvc_target(common-ggml)
configure_msvc_target(common)
disable_msvc_warnings(mnist-common)
disable_msvc_warnings(mnist-eval)
disable_msvc_warnings(mnist-train)
configure_msvc_target(mnist-common)
configure_msvc_target(mnist-eval)
configure_msvc_target(mnist-train)
disable_msvc_warnings(gpt-2-ctx)
disable_msvc_warnings(gpt-2-alloc)
disable_msvc_warnings(gpt-2-backend)
disable_msvc_warnings(gpt-2-sched)
disable_msvc_warnings(gpt-2-quantize)
disable_msvc_warnings(gpt-2-batched)
configure_msvc_target(gpt-2-ctx)
configure_msvc_target(gpt-2-alloc)
configure_msvc_target(gpt-2-backend)
configure_msvc_target(gpt-2-sched)
configure_msvc_target(gpt-2-quantize)
configure_msvc_target(gpt-2-batched)
disable_msvc_warnings(gpt-j)
disable_msvc_warnings(gpt-j-quantize)
configure_msvc_target(gpt-j)
configure_msvc_target(gpt-j-quantize)
disable_msvc_warnings(magika)
disable_msvc_warnings(yolov3-tiny)
disable_msvc_warnings(sam)
configure_msvc_target(magika)
configure_msvc_target(yolov3-tiny)
configure_msvc_target(sam)
disable_msvc_warnings(simple-ctx)
disable_msvc_warnings(simple-backend)
configure_msvc_target(simple-ctx)
configure_msvc_target(simple-backend)
endif()
if (GGML_BUILD_TESTS)
disable_msvc_warnings(test-mul-mat)
disable_msvc_warnings(test-arange)
disable_msvc_warnings(test-backend-ops)
disable_msvc_warnings(test-cont)
disable_msvc_warnings(test-conv-transpose)
disable_msvc_warnings(test-conv-transpose-1d)
disable_msvc_warnings(test-conv1d)
disable_msvc_warnings(test-conv2d)
disable_msvc_warnings(test-conv2d-dw)
disable_msvc_warnings(test-customop)
disable_msvc_warnings(test-dup)
disable_msvc_warnings(test-opt)
disable_msvc_warnings(test-pool)
configure_msvc_target(test-mul-mat)
configure_msvc_target(test-arange)
configure_msvc_target(test-backend-ops)
configure_msvc_target(test-cont)
configure_msvc_target(test-conv-transpose)
configure_msvc_target(test-conv-transpose-1d)
configure_msvc_target(test-conv1d)
configure_msvc_target(test-conv2d)
configure_msvc_target(test-conv2d-dw)
configure_msvc_target(test-customop)
configure_msvc_target(test-dup)
configure_msvc_target(test-opt)
configure_msvc_target(test-pool)
endif ()
endif()
+2 -4
View File
@@ -1240,10 +1240,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
}
if (sched->n_copies > 1) {
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
+4
View File
@@ -2564,6 +2564,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
return true;
case GGML_OP_OUT_PROD:
{
#ifdef ASCEND_310P
// Ger is not supported on 310p device
return false;
#endif
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
+6 -14
View File
@@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
}
#if defined(__ARM_ARCH)
#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>
#endif
static void ggml_init_arm_arch_features(void) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
#if defined(__linux__)
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
#else
// TODO: add support of SVE for non-linux systems
#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
#endif
#endif
#include <arm_sve.h>
static void ggml_init_arm_arch_features(void) {
ggml_arm_arch_features.sve_cnt = svcntb();
}
#else
static void ggml_init_arm_arch_features(void) {}
#endif
#endif // __ARM_ARCH
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
+1 -1
View File
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
suggest = false;
}
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
suggest = false;
}
#endif
+1 -1
View File
@@ -1375,7 +1375,7 @@ int main() {
try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
tc.verify_status(SUCCESS);
} catch (const std::runtime_error & ex) {
} catch (const std::invalid_argument & ex) {
fprintf(stderr, "Error: %s\n", ex.what());
tc.verify_status(FAILURE);
}
+26 -21
View File
@@ -819,26 +819,26 @@ json oaicompat_chat_params_parse(
auto schema_wrapper = json_value(response_format, "json_schema", json::object());
json_schema = json_value(schema_wrapper, "schema", json::object());
} else if (!response_type.empty() && response_type != "text") {
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
throw std::invalid_argument("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
}
}
// get input files
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
throw std::invalid_argument("'messages' is required");
}
json & messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
throw std::invalid_argument("Expected 'messages' to be an array");
}
for (auto & msg : messages) {
std::string role = json_value(msg, "role", std::string());
if (role != "assistant" && !msg.contains("content")) {
throw std::runtime_error("All non-assistant messages must contain 'content'");
throw std::invalid_argument("All non-assistant messages must contain 'content'");
}
if (role == "assistant") {
if (!msg.contains("content") && !msg.contains("tool_calls")) {
throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
throw std::invalid_argument("Assistant message must contain either 'content' or 'tool_calls'!");
}
if (!msg.contains("content")) {
continue; // avoid errors with no content
@@ -850,7 +850,7 @@ json oaicompat_chat_params_parse(
}
if (!content.is_array()) {
throw std::runtime_error("Expected 'content' to be a string or an array");
throw std::invalid_argument("Expected 'content' to be a string or an array");
}
for (auto & p : content) {
@@ -884,11 +884,11 @@ json oaicompat_chat_params_parse(
// try to decode base64 image
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
if (parts.size() != 2) {
throw std::runtime_error("Invalid image_url.url value");
throw std::invalid_argument("Invalid image_url.url value");
} else if (!string_starts_with(parts[0], "data:image/")) {
throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
throw std::invalid_argument("Invalid image_url.url format: " + parts[0]);
} else if (!string_ends_with(parts[0], "base64")) {
throw std::runtime_error("image_url.url must be base64 encoded");
throw std::invalid_argument("image_url.url must be base64 encoded");
} else {
auto base64_data = parts[1];
auto decoded_data = base64_decode(base64_data);
@@ -911,7 +911,7 @@ json oaicompat_chat_params_parse(
std::string format = json_value(input_audio, "format", std::string());
// while we also support flac, we don't allow it here so we matches the OAI spec
if (format != "wav" && format != "mp3") {
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
}
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
@@ -922,7 +922,7 @@ json oaicompat_chat_params_parse(
p.erase("input_audio");
} else if (type != "text") {
throw std::runtime_error("unsupported content[].type");
throw std::invalid_argument("unsupported content[].type");
}
}
}
@@ -940,7 +940,7 @@ json oaicompat_chat_params_parse(
inputs.enable_thinking = opt.enable_thinking;
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
if (body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
}
llama_params["parse_tool_calls"] = true;
}
@@ -959,7 +959,7 @@ json oaicompat_chat_params_parse(
} else if (enable_thinking_kwarg == "false") {
inputs.enable_thinking = false;
} else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
}
// if the assistant message appears at the end of list, we do not add end-of-turn token
@@ -972,14 +972,14 @@ json oaicompat_chat_params_parse(
/* sanity check, max one assistant message at the end of the list */
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
}
/* TODO: test this properly */
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
if ( inputs.enable_thinking ) {
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
}
inputs.add_generation_prompt = true;
@@ -1020,18 +1020,18 @@ json oaicompat_chat_params_parse(
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
throw std::runtime_error("Only one completion choice is allowed");
throw std::invalid_argument("Only one completion choice is allowed");
}
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
if (json_value(body, "logprobs", false)) {
if (has_tools && stream) {
throw std::runtime_error("logprobs is not supported with tools + stream");
throw std::invalid_argument("logprobs is not supported with tools + stream");
}
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
throw std::invalid_argument("top_logprobs requires logprobs to be set to true");
}
// Copy remaining properties to llama_params
@@ -1263,7 +1263,11 @@ json convert_anthropic_to_oai(const json & body) {
return oai_body;
}
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
json format_embeddings_response_oaicompat(
const json & request,
const std::string & model_name,
const json & embeddings,
bool use_base64) {
json data = json::array();
int32_t n_tokens = 0;
int i = 0;
@@ -1293,7 +1297,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
}
json res = json {
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"model", json_value(request, "model", model_name)},
{"object", "list"},
{"usage", json {
{"prompt_tokens", n_tokens},
@@ -1307,6 +1311,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
json format_response_rerank(
const json & request,
const std::string & model_name,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts,
@@ -1338,7 +1343,7 @@ json format_response_rerank(
if (is_tei_format) return results;
json res = json{
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"model", json_value(request, "model", model_name)},
{"object", "list"},
{"usage", json{
{"prompt_tokens", n_tokens},
+6 -3
View File
@@ -13,8 +13,6 @@
#include <vector>
#include <cinttypes>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
using json = nlohmann::ordered_json;
@@ -298,11 +296,16 @@ json oaicompat_chat_params_parse(
json convert_anthropic_to_oai(const json & body);
// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
json format_embeddings_response_oaicompat(
const json & request,
const std::string & model_name,
const json & embeddings,
bool use_base64 = false);
// TODO: move it to server-task.cpp
json format_response_rerank(
const json & request,
const std::string & model_name,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts,
+22 -6
View File
@@ -17,6 +17,7 @@
#include <cinttypes>
#include <memory>
#include <unordered_set>
#include <filesystem>
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -518,6 +519,8 @@ struct server_context_impl {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
std::string model_name; // name of the loaded model, to be used by API
common_chat_templates_ptr chat_templates;
oaicompat_parser_options oai_parser_opt;
@@ -758,6 +761,18 @@ struct server_context_impl {
}
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
if (!params_base.model_alias.empty()) {
// user explicitly specified model name
model_name = params_base.model_alias;
} else if (!params_base.model.name.empty()) {
// use model name in registry format (for models in cache)
model_name = params_base.model.name;
} else {
// fallback: derive model name from file name
auto model_path = std::filesystem::path(params_base.model.path);
model_name = model_path.filename().string();
}
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 2. The chat template supports it
@@ -2611,7 +2626,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
// OAI-compat
task.params.res_type = res_type;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl
task.params.oaicompat_model = ctx_server.model_name;
tasks.push_back(std::move(task));
}
@@ -2939,7 +2954,7 @@ void server_routes::init_routes() {
json data = {
{ "default_generation_settings", default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_alias", ctx_server.params_base.model_alias },
{ "model_alias", ctx_server.model_name },
{ "model_path", ctx_server.params_base.model.path },
{ "modalities", json {
{"vision", ctx_server.oai_parser_opt.allow_image},
@@ -3181,8 +3196,8 @@ void server_routes::init_routes() {
json models = {
{"models", {
{
{"name", params.model_alias.empty() ? params.model.path : params.model_alias},
{"model", params.model_alias.empty() ? params.model.path : params.model_alias},
{"name", ctx_server.model_name},
{"model", ctx_server.model_name},
{"modified_at", ""},
{"size", ""},
{"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -3204,7 +3219,7 @@ void server_routes::init_routes() {
{"object", "list"},
{"data", {
{
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
{"id", ctx_server.model_name},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
@@ -3351,6 +3366,7 @@ void server_routes::init_routes() {
// write JSON response
json root = format_response_rerank(
body,
ctx_server.model_name,
responses,
is_tei_format,
documents,
@@ -3613,7 +3629,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
// write JSON response
json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
? format_embeddings_response_oaicompat(body, responses, use_base64)
? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
: json(responses);
res->ok(root);
return res;
+8 -8
View File
@@ -642,26 +642,26 @@ static void res_ok(std::unique_ptr<server_http_res> & res, const json & response
res->data = safe_json_to_str(response_data);
}
static void res_error(std::unique_ptr<server_http_res> & res, const json & error_data) {
static void res_err(std::unique_ptr<server_http_res> & res, const json & error_data) {
res->status = json_value(error_data, "code", 500);
res->data = safe_json_to_str({{ "error", error_data }});
}
static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
if (name.empty()) {
res_error(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return false;
}
auto meta = models.get_meta(name);
if (!meta.has_value()) {
res_error(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
return false;
}
if (models_autoload) {
models.ensure_model_loaded(name);
} else {
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return false;
}
}
@@ -761,11 +761,11 @@ void server_models_routes::init_routes() {
std::string name = json_value(body, "model", std::string());
auto model = models.get_meta(name);
if (!model.has_value()) {
res_error(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
return res;
}
if (model->status == SERVER_MODEL_STATUS_LOADED) {
res_error(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models.load(name, false);
@@ -823,11 +823,11 @@ void server_models_routes::init_routes() {
std::string name = json_value(body, "model", std::string());
auto model = models.get_meta(name);
if (!model.has_value()) {
res_error(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
return res;
}
if (model->status != SERVER_MODEL_STATUS_LOADED) {
res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models.unload(name);
-3
View File
@@ -450,9 +450,6 @@ task_params server_task::params_from_json_cmpl(
}
}
std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
params.oaicompat_model = json_value(data, "model", model_name);
return params;
}
+7 -1
View File
@@ -34,18 +34,24 @@ static inline void signal_handler(int signal) {
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
std::string message;
error_type error;
try {
return func(req);
} catch (const std::invalid_argument & e) {
error = ERROR_TYPE_INVALID_REQUEST;
message = e.what();
} catch (const std::exception & e) {
error = ERROR_TYPE_SERVER;
message = e.what();
} catch (...) {
error = ERROR_TYPE_SERVER;
message = "unknown error";
}
auto res = std::make_unique<server_http_res>();
res->status = 500;
try {
json error_data = format_error_response(message, ERROR_TYPE_SERVER);
json error_data = format_error_response(message, error);
res->status = json_value(error_data, "code", 500);
res->data = safe_json_to_str({{ "error", error_data }});
SRV_WRN("got exception: %s\n", res->data.c_str());
@@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
assert res.status_code == 200
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
assert res.body["system_fingerprint"].startswith("b")
assert res.body["model"] == model if model is not None else server.model_alias
# we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
# assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted
choice = res.body["choices"][0]
@@ -59,7 +60,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
)
def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
server.model_alias = "llama-test-model"
server.start()
res = server.make_stream_request("POST", "/chat/completions", data={
"max_tokens": max_tokens,
@@ -81,7 +82,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
else:
assert "role" not in choice["delta"]
assert data["system_fingerprint"].startswith("b")
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
assert data["model"] == "llama-test-model"
if last_cmpl_id is None:
last_cmpl_id = data["id"]
assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
@@ -198,7 +199,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
choice = res.body["choices"][0]
assert match_regex(re_content, choice["message"]["content"])
else:
assert res.status_code != 200
assert res.status_code == 400
assert "error" in res.body