Compare commits

..

17 Commits

Author SHA1 Message Date
Evan Huus 23fbfcb1ad server: Parse port numbers from MCP server URLs in CORS proxy (#20208)
* Parse port numbers from MCP server URLs

* Pass scheme to http proxy for determining whether to use SSL

* Fix download on non-standard port and re-add port to logging

* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-03-09 17:47:54 +01:00
Paul Flynn e22cd0aa15 metal : extend mul_mv_ext to BF16, Q2_K, Q3_K (#20250)
Enable mul_mv_ext small-batch kernels (BS 2-8) for BF16, Q2_K,
and Q3_K quantization types. These types previously fell through
to the slower single-row mul_mv path.

BF16 uses the float4 dequantize path (like F16). Q2_K and Q3_K
use the float4x4 K-quant path (like Q4_K/Q5_K/Q6_K).

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 16:48:12 +02:00
Georgi Gerganov 96cfc4992c server : fix checkpoints n_tokens calculation (#20287) 2026-03-09 16:47:06 +02:00
Georgi Gerganov ed0007aa32 metal : add upscale (#20284) 2026-03-09 16:45:11 +02:00
Georgi Gerganov 344ee2a38a server : warn swa-full is not supported for non-SWA models (#20291) 2026-03-09 16:44:25 +02:00
Georgi Gerganov d6e1556499 server : fix off-by-1 in server_tokens::size_up_to_pos() (#20279)
* server : fix off-by-1 in server_tokens::size_up_to_pos()

* cont : fix typo [no ci]
2026-03-09 16:43:38 +02:00
Piotr Wilkin (ilintar) f76565db92 common: map developer role to system (#20215)
* Map developer role to system
* Simplify
2026-03-09 14:25:11 +01:00
Georgi Gerganov 43e1cbd6c1 models : fix assert in mamba2 graph (#20270) 2026-03-09 13:15:15 +02:00
Georgi Gerganov 107d599952 server : add kill switch when server is stuck (#20277) 2026-03-09 10:33:12 +02:00
Aman Gupta e8bbc736cb ggml-cuda: disable gdn for musa (#20278) 2026-03-09 16:15:36 +08:00
ddh0 b518195101 llama-quant : left-align tensor names in output (#20117) 2026-03-09 09:28:41 +02:00
Aman Gupta e2763a6723 contributing: limit open PRs for new contributors to 1 (#20036) 2026-03-09 15:05:34 +08:00
Bertay Eren 0beb8db3a0 ggml-vulkan: add SGN operator, auto-generate Vulkan.csv and ops.md (#20219) 2026-03-09 07:24:16 +01:00
Ruben Ortlam b2f460bd3c vulkan: skip zero size tensors in backend copies (#20233) 2026-03-09 07:23:45 +01:00
Michael Huang 5f4cdac385 cuda : display total and free VRAM capacity during device initialization (#20185) 2026-03-09 12:45:43 +08:00
Aaron Teo ae87863dc1 llama-bench: introduce -hf and -hff flags & use --mmap 1 by default (#20211) 2026-03-09 09:05:44 +08:00
Piotr Wilkin (ilintar) 97c64fbdbd PEG parser for LFM2 (#20251)
* PEG parser for LFM2

* Simplify using python_value()
2026-03-09 01:11:22 +01:00
30 changed files with 839 additions and 421 deletions
+1
View File
@@ -39,6 +39,7 @@ Before submitting your PR:
- For intricate features, consider opening a feature request first to discuss and align expectations
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
After submitting your PR:
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
+68
View File
@@ -476,6 +476,74 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const nlohmann::json & tools,
bool parallel_tool_calls) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
auto tool_choices = choice();
for (const auto & tool_def : tools) {
if (!tool_def.contains("function")) {
continue;
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
auto args = eps();
if (params.contains("properties") && !params["properties"].empty()) {
auto arg_choice = choice();
for (const auto & el : params["properties"].items()) {
const std::string & prop_name = el.key();
const auto & prop_def = el.value();
bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(json_string_content()) + literal("\""),
literal("'") + tool_arg_string_value(json_string_content()) + literal("'")
});
if (is_string_type) {
arg_value_parser = string_value_parser;
} else {
arg_value_parser = tool_arg_value(python_value());
}
// Full argument: name="value" or name=value
auto arg_rule = tool_arg(
tool_arg_open(eps()) +
tool_arg_name(arg_name_parser) +
literal("=") +
arg_value_parser +
tool_arg_close(eps())
);
arg_choice |= arg_rule;
}
args = arg_choice + zero_or_more("," + space() + arg_choice);
}
auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
space() + tool_args(args) + space() + tool_close(literal(")"))
);
tool_choices |= rule("tool-" + name, tool_parser);
}
if (parallel_tool_calls) {
return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
}
return "[" + space() + tool_choices + space() + "]";
}
// Helper: Parse dot notation key into prefix and field name
static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
auto dot_pos = key.find('.');
+5
View File
@@ -112,6 +112,11 @@ class common_chat_peg_builder : public common_peg_parser_builder {
bool parallel_tool_calls,
bool force_tool_calls);
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::json & tools,
bool parallel_tool_calls);
private:
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,
+99
View File
@@ -1274,8 +1274,95 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
);
auto content = p.content(p.until(TOOL_CALL_START));
return reasoning + content + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
namespace workaround {
static void map_developer_role_to_system(json & messages) {
for (auto & message : messages) {
if (message.contains("role")) {
if (message["role"] == "developer") {
message["role"] = "system";
}
}
}
}
// if first message is system and template does not support it, merge it with next message
static void system_message_not_supported(json & messages) {
if (!messages.empty() && messages.front().at("role") == "system") {
@@ -1353,6 +1440,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;
if (src.find("<|channel|>") == std::string::npos) {
// map developer to system for all models except for GPT-OSS
workaround::map_developer_role_to_system(params.messages);
}
workaround::func_args_not_string(params.messages);
if (!tmpl.original_caps().supports_system_role) {
@@ -1422,6 +1513,14 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
try {
LOG_DBG("Using differential autoparser\n");
struct autoparser::autoparser autoparser;
+16 -1
View File
@@ -7,6 +7,7 @@ struct common_http_url {
std::string user;
std::string password;
std::string host;
int port;
std::string path;
};
@@ -47,6 +48,20 @@ static common_http_url common_http_parse_url(const std::string & url) {
parts.host = rest;
parts.path = "/";
}
auto colon_pos = parts.host.find(':');
if (colon_pos != std::string::npos) {
parts.port = std::stoi(parts.host.substr(colon_pos + 1));
parts.host = parts.host.substr(0, colon_pos);
} else if (parts.scheme == "http") {
parts.port = 80;
} else if (parts.scheme == "https") {
parts.port = 443;
} else {
throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
}
return parts;
}
@@ -68,7 +83,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
}
#endif
httplib::Client cli(parts.scheme + "://" + parts.host);
httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
if (!parts.user.empty()) {
cli.set_basic_auth(parts.user, parts.password);
+2 -1
View File
@@ -47,6 +47,7 @@ Legend:
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -92,7 +93,7 @@ Legend:
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | | ✅ | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+17 -4
View File
@@ -1,8 +1,8 @@
"backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
@@ -85,8 +85,8 @@
"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
@@ -13591,3 +13591,16 @@
"Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan"
"Vulkan0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","0","no","Vulkan"
Can't render this file because it is too large.
+30 -8
View File
@@ -205,7 +205,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
total_vram += prop.totalGlobalMem;
}
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices (Total VRAM: %zu MiB):\n",
__func__, info.device_count, (size_t)(total_vram / (1024 * 1024)));
total_vram = 0;
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
for (int id = 0; id < info.device_count; ++id) {
@@ -243,6 +250,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].supports_cooperative_launch = false;
#endif // !(GGML_USE_MUSA)
// cudaMemGetInfo returns info for the current device
size_t free_mem;
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL));
#if defined(GGML_USE_HIP)
info.devices[id].smpbo = prop.sharedMemPerBlock;
@@ -257,22 +270,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
device_vmm ? "yes" : "no", prop.warpSize);
device_vmm ? "yes" : "no", prop.warpSize,
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -4976,9 +4992,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_LEAKY_RELU:
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_GATED_DELTA_NET:
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
//TODO: enable once MUSA compiler is solved https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
#ifdef GGML_USE_MUSA
return false;
#else
return true;
#endif // GGML_USE_MUSA
case GGML_OP_FLASH_ATTN_EXT:
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
case GGML_OP_CROSS_ENTROPY_LOSS:
+20 -3
View File
@@ -1717,12 +1717,29 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_met
char base[256];
char name[256];
snprintf(base, 256, "kernel_upscale_%s", ggml_type_name(op->src[0]->type));
snprintf(name, 256, "%s", base);
const int32_t mode_flags = ggml_get_op_params_i32(op, 0);
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS);
if (mode == GGML_SCALE_MODE_BILINEAR) {
snprintf(base, 256, "kernel_upscale_bilinear_%s", ggml_type_name(op->src[0]->type));
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
snprintf(base, 256, "kernel_upscale_bicubic_%s", ggml_type_name(op->src[0]->type));
} else {
snprintf(base, 256, "kernel_upscale_nearest_%s", ggml_type_name(op->src[0]->type));
}
snprintf(name, 256, "%s_aa=%d", base, antialias);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_bool(cv, antialias, FC_UPSCALE + 0);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
ggml_metal_cv_free(cv);
}
return res;
+1 -1
View File
@@ -1108,7 +1108,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
op->type == GGML_TYPE_F32 &&
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_POOL_1D:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_POOL_2D:
+2
View File
@@ -83,6 +83,7 @@
#define FC_UNARY 1200
#define FC_BIN 1300
#define FC_SUM_ROWS 1400
#define FC_UPSCALE 1500
// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPSG 8
@@ -890,6 +891,7 @@ typedef struct {
float sf1;
float sf2;
float sf3;
float poffs;
} ggml_metal_kargs_upscale;
typedef struct {
+38 -24
View File
@@ -1963,6 +1963,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
(
op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
op->src[0]->type == GGML_TYPE_F16 ||
op->src[0]->type == GGML_TYPE_BF16 ||
op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_Q4_1 ||
op->src[0]->type == GGML_TYPE_Q5_0 ||
@@ -1977,6 +1978,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
op->src[0]->type == GGML_TYPE_Q4_K ||
op->src[0]->type == GGML_TYPE_Q5_K ||
op->src[0]->type == GGML_TYPE_Q6_K ||
op->src[0]->type == GGML_TYPE_Q2_K ||
op->src[0]->type == GGML_TYPE_Q3_K ||
false) && (ne11 >= 4 && ne11 <= 8)
)
)
@@ -3729,32 +3732,43 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
const float sf0 = (float)ne0/op->src[0]->ne[0];
const float sf1 = (float)ne1/op->src[0]->ne[1];
const float sf2 = (float)ne2/op->src[0]->ne[2];
const float sf3 = (float)ne3/op->src[0]->ne[3];
float sf0 = (float)ne0/op->src[0]->ne[0];
float sf1 = (float)ne1/op->src[0]->ne[1];
float sf2 = (float)ne2/op->src[0]->ne[2];
float sf3 = (float)ne3/op->src[0]->ne[3];
const int32_t mode_flags = ggml_get_op_params_i32(op, 0);
float poffs = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
poffs = 0.0f;
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
}
ggml_metal_kargs_upscale args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.ne2 =*/ ne2,
/*.ne3 =*/ ne3,
/*.nb0 =*/ nb0,
/*.nb1 =*/ nb1,
/*.nb2 =*/ nb2,
/*.nb3 =*/ nb3,
/*.sf0 =*/ sf0,
/*.sf1 =*/ sf1,
/*.sf2 =*/ sf2,
/*.sf3 =*/ sf3
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.ne2 =*/ ne2,
/*.ne3 =*/ ne3,
/*.nb0 =*/ nb0,
/*.nb1 =*/ nb1,
/*.nb2 =*/ nb2,
/*.nb3 =*/ nb3,
/*.sf0 =*/ sf0,
/*.sf1 =*/ sf1,
/*.sf2 =*/ sf2,
/*.sf3 =*/ sf3,
/*.poffs =*/ poffs,
};
auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
+170 -1
View File
@@ -3481,6 +3481,13 @@ template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]] kernel mul_mv_ext_q4
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>;
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>;
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, bfloat4, 4, dequantize_bf16_t4>;
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, bfloat4, 4, dequantize_bf16_t4>;
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, bfloat4, 4, dequantize_bf16_t4>;
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>;
#endif
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
@@ -3531,6 +3538,16 @@ template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q3_K, 256, dequantize_q3_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q3_K, 256, dequantize_q3_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q3_K, 256, dequantize_q3_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q3_K, 256, dequantize_q3_K>;
template<typename T0, typename T1, short NR0, typename args_t>
void kernel_mul_mv_t_t_impl(
args_t args,
@@ -4530,7 +4547,9 @@ kernel void kernel_conv_transpose_2d<half>(
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]);
kernel void kernel_upscale_f32(
constant bool FC_upscale_aa [[function_constant(FC_UPSCALE + 0)]];
kernel void kernel_upscale_nearest_f32(
constant ggml_metal_kargs_upscale & args,
device const char * src0,
device char * dst,
@@ -4556,6 +4575,156 @@ kernel void kernel_upscale_f32(
}
}
static inline float bilinear_tri(float x) {
return MAX(0.0f, 1.0f - fabs(x));
}
kernel void kernel_upscale_bilinear_f32(
constant ggml_metal_kargs_upscale & args,
device const char * src0,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3 / args.sf3;
const int64_t i02 = i2 / args.sf2;
const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
const int64_t i01 = MAX(0, MIN(args.ne01 - 1, (int64_t)floor(f01)));
const int64_t i01p = MAX(0, MIN(args.ne01 - 1, i01 + 1));
const float fd1 = MAX(0.0f, MIN(1.0f, f01 - (float)i01));
src0 += i03*args.nb03 + i02*args.nb02;
device float * dst_ptr = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
if (FC_upscale_aa) {
const float support0 = MAX(1.0f, 1.0f / args.sf0);
const float invscale0 = 1.0f / support0;
const float support1 = MAX(1.0f, 1.0f / args.sf1);
const float invscale1 = 1.0f / support1;
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
int64_t x_min = MAX((int64_t)0, (int64_t)floor(f00 - support0 + args.poffs));
int64_t x_max = MIN(args.ne00, (int64_t)ceil (f00 + support0 + args.poffs));
int64_t y_min = MAX((int64_t)0, (int64_t)floor(f01 - support1 + args.poffs));
int64_t y_max = MIN(args.ne01, (int64_t)ceil (f01 + support1 + args.poffs));
float sum = 0.0f;
float wsum = 0.0f;
for (int64_t sy = y_min; sy < y_max; ++sy) {
const float wy = MAX(0.0f, 1.0f - fabs((float)sy - f01) * invscale1);
for (int64_t sx = x_min; sx < x_max; ++sx) {
const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0);
const float w = wx * wy;
const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
sum += (*src_ptr) * w;
wsum += w;
}
}
const float v = (wsum > 0.0f) ? (sum / wsum) : 0.0f;
dst_ptr[i0] = v;
}
} else {
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
const int64_t i00 = MAX(0, MIN(args.ne00 - 1, (int64_t)floor(f00)));
const int64_t i00p = MAX(0, MIN(args.ne00 - 1, i00 + 1));
const float fd0 = MAX(0.0f, MIN(1.0f, f00 - (float)i00));
device const float * src00 = (device const float *)(src0 + i01*args.nb01 + i00*args.nb00);
device const float * src10 = (device const float *)(src0 + i01*args.nb01 + i00p*args.nb00);
device const float * src01 = (device const float *)(src0 + i01p*args.nb01 + i00*args.nb00);
device const float * src11 = (device const float *)(src0 + i01p*args.nb01 + i00p*args.nb00);
const float v =
(*src00) * (1.0f - fd0) * (1.0f - fd1) +
(*src10) * fd0 * (1.0f - fd1) +
(*src01) * (1.0f - fd0) * fd1 +
(*src11) * fd0 * fd1;
dst_ptr[i0] = v;
}
}
}
static inline float bicubic_weight1(float x) {
const float a = -0.75f;
return ((a + 2) * x - (a + 3)) * x * x + 1;
}
static inline float bicubic_weight2(float x) {
const float a = -0.75f;
return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a;
}
kernel void kernel_upscale_bicubic_f32(
constant ggml_metal_kargs_upscale & args,
device const char * src0,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3 / args.sf3;
const int64_t i02 = i2 / args.sf2;
const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
const int64_t i01 = (int64_t)floor(f01);
const float fd1 = f01 - (float)i01;
const float w_y0 = bicubic_weight2(fd1 + 1.0f);
const float w_y1 = bicubic_weight1(fd1);
const float w_y2 = bicubic_weight1(1.0f - fd1);
const float w_y3 = bicubic_weight2(2.0f - fd1);
const device const char * src_slice = src0 + i03 * args.nb03 + i02 * args.nb02;
device float * dst_ptr = (device float *)(dst + i3 * args.nb3 + i2 * args.nb2 + i1 * args.nb1);
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
const int64_t i00 = (int64_t)floor(f00);
const float fd0 = f00 - (float)i00;
const float w_x0 = bicubic_weight2(fd0 + 1.0f);
const float w_x1 = bicubic_weight1(fd0);
const float w_x2 = bicubic_weight1(1.0f - fd0);
const float w_x3 = bicubic_weight2(2.0f - fd0);
float sum = 0.0f;
for (int dy = -1; dy <= 2; ++dy) {
const int64_t iy = MAX(0, MIN(args.ne01 - 1, i01 + dy));
const float wy = (dy == -1) ? w_y0 : (dy == 0) ? w_y1 : (dy == 1) ? w_y2 : w_y3;
for (int dx = -1; dx <= 2; ++dx) {
const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx));
const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3;
const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
sum += (*src_ptr) * wx * wy;
}
}
dst_ptr[i0] = sum;
}
}
kernel void kernel_pad_f32(
constant ggml_metal_kargs_pad & args,
device const char * src0,
+39 -1
View File
@@ -763,6 +763,7 @@ struct vk_device_struct {
vk_pipeline pipeline_ceil[2];
vk_pipeline pipeline_floor[2];
vk_pipeline pipeline_trunc[2];
vk_pipeline pipeline_sgn[2];
vk_pipeline pipeline_add1_f16_f16;
vk_pipeline pipeline_add1_f16_f32;
@@ -4393,6 +4394,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_UNARY(ceil)
CREATE_UNARY(floor)
CREATE_UNARY(trunc)
CREATE_UNARY(sgn)
#undef CREATE_UNARY
#define CREATE_UNARY_RTE(name) \
@@ -9281,6 +9283,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16];
case GGML_UNARY_OP_TRUNC:
return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16];
case GGML_UNARY_OP_SGN:
return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16];
default:
break;
}
@@ -12875,6 +12879,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
case GGML_UNARY_OP_SGN:
ggml_vk_unary(ctx, compute_ctx, src0, node);
break;
case GGML_UNARY_OP_XIELU:
@@ -13253,6 +13258,10 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
vk_buffer buf = buf_ctx->dev_buffer;
if (size == 0) {
return;
}
uint32_t val32 = (uint32_t)value * 0x01010101;
ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size);
}
@@ -13262,6 +13271,10 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
vk_buffer buf = buf_ctx->dev_buffer;
if (size == 0) {
return;
}
ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
}
@@ -13269,12 +13282,20 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons
VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
if (size == 0) {
return;
}
vk_buffer buf = buf_ctx->dev_buffer;
ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
}
static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_nbytes(src) == 0) {
return true;
}
if (ggml_backend_buffer_is_vk(src->buffer)) {
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
@@ -13464,6 +13485,10 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
if (size == 0) {
return;
}
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context cpy_ctx;
@@ -13507,6 +13532,10 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
if (size == 0) {
return;
}
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
@@ -13533,9 +13562,14 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
}
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
// Skip zero-size tensors
if (ggml_nbytes(src) == 0) {
return true;
}
if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) {
return false;
}
@@ -14975,6 +15009,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
case GGML_UNARY_OP_SGN:
return ggml_is_contiguous(op->src[0]) &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
@@ -16141,6 +16176,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
case GGML_UNARY_OP_TRUNC:
tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]);
break;
case GGML_UNARY_OP_SGN:
tensor_clone = ggml_sgn(ggml_ctx, src_clone[0]);
break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ABORT("fatal error");
@@ -0,0 +1,21 @@
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
data_d[i] = D_TYPE(sign(float(data_a[i])));
}
@@ -871,6 +871,8 @@ void process_shaders() {
string_to_spv("elu_f32", "elu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("xielu_f16", "xielu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("xielu_f32", "xielu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("sgn_f16", "sgn.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("sgn_f32", "sgn.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+2 -12
View File
@@ -6,7 +6,7 @@
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "You can use the following tools: <|tool_list_start|>[" -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
@@ -17,7 +17,6 @@
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{{- '**IMPORTANT**: The syntax for calling the tools is: <|tool_call_start|>JSON tool call goes here<|tool_call_end|>. Please only call tools in the specified manner.' -}}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
@@ -30,18 +29,9 @@
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
{%- elif message["role"] == "assistant" -%}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<|tool_call_start|>\n{"name": "' + tool_call.name + '", "arguments": ' + (tool_call.arguments if tool_call.arguments is string else tool_call.arguments | tojson) + '}\n<|tool_call_end|>\n' }}
{%- endfor %}
{%- endif %}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
{%- endif -%}
-37
View File
@@ -1,37 +0,0 @@
{{- bos_token -}}
{%- set system_prompt = "" -%}
{%- set ns = namespace(system_prompt="") -%}
{%- if messages[0]["role"] == "system" -%}
{%- set ns.system_prompt = messages[0]["content"] -%}
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
{%- endif -%}
{%- set ns.system_prompt = ns.system_prompt + tool -%}
{%- if not loop.last -%}
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{%- for message in messages -%}
{{- "<|im_start|>" + message["role"] + "\n" -}}
{%- set content = message["content"] -%}
{%- if content is not string -%}
{%- set content = content | tojson -%}
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
+1 -1
View File
@@ -778,7 +778,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ml.load_data_for(tensor);
}
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
+1 -2
View File
@@ -155,7 +155,6 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
const auto kv_head = mctx_cur->get_head();
const int64_t n_embd = hparams.n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
@@ -170,7 +169,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
GGML_ASSERT(d_inner % (n_group*n_embd) == 0);
GGML_ASSERT(d_inner % (n_group*d_state) == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+111 -252
View File
@@ -800,258 +800,6 @@ const common_chat_msg message_assist_call_python_lines_unclosed =
const common_chat_msg message_assist_json_content =
simple_assist_msg("{\n \"response\": \"Hello, world!\\nWhat's up?\"\n}");
struct delta_data {
std::string delta;
common_chat_params params;
};
static delta_data init_delta(const struct common_chat_templates * tmpls,
const std::vector<std::string> & end_tokens,
const common_chat_msg & user_message,
const common_chat_msg & delta_message,
const std::vector<common_chat_tool> & tools,
const common_chat_tool_choice & tool_choice) {
common_chat_templates_inputs inputs;
inputs.parallel_tool_calls = true;
inputs.messages.push_back(user_message);
inputs.tools = tools;
inputs.tool_choice = tool_choice;
auto params_prefix = common_chat_templates_apply(tmpls, inputs);
inputs.messages.push_back(delta_message);
inputs.add_generation_prompt = false;
auto params_full = common_chat_templates_apply(tmpls, inputs);
std::string prefix = params_prefix.prompt;
std::string full = params_full.prompt;
if (full == prefix) {
throw std::runtime_error("Full message is the same as the prefix");
}
size_t common_prefix_length = 0;
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
if (prefix[i] != full[i]) {
break;
}
if (prefix[i] == '<') {
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
// but it removes thinking tags for past messages.
// The prefix and full strings diverge at <think> vs. <tool▁calls▁begin>, we avoid consuming the leading <.
continue;
}
common_prefix_length = i + 1;
}
auto delta = full.substr(common_prefix_length);
// Strip end tokens
for (const auto & end_token : end_tokens) {
// rfind to find the last occurrence
auto pos = delta.rfind(end_token);
if (pos != std::string::npos) {
delta = delta.substr(0, pos);
break;
}
}
return { delta, params_full };
}
/*
Applies the template to 1 user message w/ add_generation_prompt=true, then w/ the test message w/ add_generation_prompt=false,
gets the diff, removes any end tokens and parses the result w/ the grammar, checking that
the parsed message is the same as the test_message
*/
static void test_templates(const struct common_chat_templates * tmpls,
const std::vector<std::string> & end_tokens,
const common_chat_msg & test_message,
const std::vector<common_chat_tool> & tools = {},
const std::string & expected_delta = "",
bool expect_grammar_triggered = true,
bool test_grammar_if_triggered = true,
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE,
bool ignore_whitespace_differences = false) {
common_chat_msg user_message;
user_message.role = "user";
user_message.content = "Hello, world!";
common_chat_templates_inputs inputs_tools;
inputs_tools.messages = { message_user };
inputs_tools.tools = { special_function_tool };
common_chat_params params = common_chat_templates_apply(tmpls, inputs_tools);
for (const auto & tool_choice :
std::vector<common_chat_tool_choice>{ COMMON_CHAT_TOOL_CHOICE_AUTO, COMMON_CHAT_TOOL_CHOICE_REQUIRED }) {
auto data = init_delta(tmpls, end_tokens, user_message, test_message, tools, tool_choice);
if (!expected_delta.empty()) {
if (ignore_whitespace_differences) {
assert_equals(string_strip(expected_delta), string_strip(data.delta));
} else {
assert_equals(expected_delta, data.delta);
}
}
if (expect_grammar_triggered) {
// TODO @ngxson : refactor common_chat_parse to avoid passing format/reasoning_format every time
common_chat_parser_params parser_params;
parser_params.format = data.params.format;
parser_params.reasoning_format = reasoning_format;
if (!parser_params.parser.empty()) {
parser_params.parser = common_peg_arena();
parser_params.parser.load(params.parser);
}
const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, parser_params);
assert_msg_equals(test_message, msg, ignore_whitespace_differences);
}
if (!test_message.tool_calls.empty()) {
GGML_ASSERT(!data.params.grammar.empty());
}
if (!data.params.grammar.empty()) {
auto grammar = build_grammar(data.params.grammar);
if (!grammar) {
throw std::runtime_error("Failed to build grammar");
}
auto earliest_trigger_pos = std::string::npos;
auto constrained = data.delta;
for (const auto & trigger : data.params.grammar_triggers) {
size_t pos = std::string::npos;
std::smatch match;
switch (trigger.type) {
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
{
const auto & word = trigger.value;
pos = constrained.find(word);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
{
const auto & pattern = std::regex(trigger.value);
if (std::regex_search(constrained, match, pattern)) {
pos = match.position(pattern.mark_count());
}
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{
const auto & pattern = trigger.value;
if (std::regex_match(constrained, match, std::regex(pattern))) {
auto mpos = std::string::npos;
for (size_t i = 1; i < match.size(); ++i) {
if (match[i].length() > 0) {
mpos = match.position(i);
break;
}
}
if (mpos == std::string::npos) {
mpos = match.position(0);
}
pos = mpos;
}
break;
}
default:
throw std::runtime_error("Unknown trigger type");
}
if (pos == std::string::npos) {
continue;
}
if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
earliest_trigger_pos = pos;
}
}
auto grammar_triggered = false;
if (earliest_trigger_pos != std::string::npos) {
constrained = constrained.substr(earliest_trigger_pos);
grammar_triggered = true;
}
if (data.params.grammar_lazy) {
assert_equals(expect_grammar_triggered, grammar_triggered);
}
if (grammar_triggered && test_grammar_if_triggered && !match_string(constrained, grammar.get())) {
throw std::runtime_error("Failed to match delta against grammar:\n\n" + data.delta +
"\n\nConstrained: " + constrained + "\n\nGrammar: " + data.params.grammar);
}
}
}
}
/**
* Test if streaming=true is consistent with streaming=false for given partial parser
* Also test if there is any problem with partial message
*/
template <typename T>
static void test_parser_with_streaming(const common_chat_msg & expected, const std::string & raw_message, T parse_msg) {
constexpr auto utf8_truncate_safe_len = [](const std::string_view s) -> size_t {
auto len = s.size();
if (len == 0) {
return 0;
}
auto i = len;
for (size_t back = 0; back < 4 && i > 0; ++back) {
--i;
unsigned char c = s[i];
if ((c & 0x80) == 0) {
return len;
}
if ((c & 0xC0) == 0xC0) {
size_t expected_len = 0;
if ((c & 0xE0) == 0xC0) {
expected_len = 2;
} else if ((c & 0xF0) == 0xE0) {
expected_len = 3;
} else if ((c & 0xF8) == 0xF0) {
expected_len = 4;
} else {
return i;
}
if (len - i >= expected_len) {
return len;
}
return i;
}
}
return len - std::min(len, size_t(3));
};
constexpr auto utf8_truncate_safe_view = [utf8_truncate_safe_len](const std::string_view s) {
return s.substr(0, utf8_truncate_safe_len(s));
};
auto merged = simple_assist_msg("");
auto last_msg = parse_msg("");
for (size_t i = 1; i <= raw_message.size(); ++i) {
auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
if (curr_msg == simple_assist_msg("")) {
continue;
}
LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat({ curr_msg }).dump().c_str());
for (auto diff : common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat(diff).dump().c_str());
if (!diff.reasoning_content_delta.empty()) {
merged.reasoning_content += diff.reasoning_content_delta;
}
if (!diff.content_delta.empty()) {
merged.content += diff.content_delta;
}
if (diff.tool_call_index != std::string::npos) {
if (!diff.tool_call_delta.name.empty()) {
merged.tool_calls.push_back({ diff.tool_call_delta.name, "", "" });
}
if (!diff.tool_call_delta.arguments.empty()) {
GGML_ASSERT(!merged.tool_calls.empty());
merged.tool_calls.back().arguments += diff.tool_call_delta.arguments;
}
}
LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat({ merged }).dump().c_str());
}
assert_msg_equals(curr_msg, merged, true);
last_msg = curr_msg;
}
assert_msg_equals(expected, parse_msg(raw_message), true);
assert_msg_equals(expected, merged, true);
}
// Use for PEG parser implementations
struct peg_test_case {
common_chat_templates_inputs params;
@@ -2387,6 +2135,78 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|>
{
auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with reasoning and content
tst.test("<think>I need to call a function</think>"
"Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"I need to call a function", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
))
.run();
// Python tool with multiline code in string
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
})
.run();
// Partial tool call (streaming)
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
}
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
{
@@ -2947,6 +2767,44 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
}
}
// Test the developer role to system workaround with a simple mock template
static void test_developer_role_to_system_workaround() {
LOG_DBG("%s\n", __func__);
// Simple mock template that supports system role
const std::string mock_template =
"{%- for message in messages -%}\n"
" {{- '<|' + message.role + '|>' + message.content + '<|end|>' -}}\n"
"{%- endfor -%}\n"
"{%- if add_generation_prompt -%}\n"
" {{- '<|assistant|>' -}}\n"
"{%- endif -%}";
auto tmpls = common_chat_templates_ptr(common_chat_templates_init(/* model= */ nullptr, mock_template));
// Test case 1: Developer message - should be changed to system
// After simplification we only test this case
{
common_chat_templates_inputs inputs;
common_chat_msg developer_msg;
developer_msg.role = "developer";
developer_msg.content = "You are a helpful developer assistant.";
inputs.messages = { developer_msg };
inputs.add_generation_prompt = false;
auto params = common_chat_templates_apply(tmpls.get(), inputs);
// The developer role should have been changed to system
if (params.prompt.find("<|developer|>") != std::string::npos) {
throw std::runtime_error("Test failed: developer role was not changed to system");
}
if (params.prompt.find("<|system|>You are a helpful developer assistant.<|end|>") == std::string::npos) {
throw std::runtime_error("Test failed: system message not found in output");
}
LOG_ERR("Test 1 passed: developer role changed to system\n");
}
}
static void test_msg_diffs_compute() {
LOG_DBG("%s\n", __func__);
{
@@ -3083,6 +2941,7 @@ int main(int argc, char ** argv) {
test_msg_diffs_compute();
test_msgs_oaicompat_json_conversion();
test_tools_oaicompat_json_conversion();
test_developer_role_to_system_workaround();
test_template_output_peg_parsers(detailed_debug);
std::cout << "\n[chat] All tests passed!" << '\n';
}
+116 -64
View File
@@ -20,6 +20,7 @@
#include <unordered_set>
#include "common.h"
#include "download.h"
#include "ggml.h"
#include "llama.h"
@@ -312,6 +313,9 @@ static std::vector<int> parse_int_range(const std::string & s) {
struct cmd_params {
std::vector<std::string> model;
std::vector<std::string> hf_repo;
std::vector<std::string> hf_file;
std::string hf_token;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
@@ -351,6 +355,9 @@ struct cmd_params {
static const cmd_params cmd_params_defaults = {
/* model */ { "models/7B/ggml-model-q4_0.gguf" },
/* hf_repo */ {},
/* hf_file */ {},
/* hf_token */ "",
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
@@ -372,7 +379,7 @@ static const cmd_params cmd_params_defaults = {
/* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
/* use_mmap */ { false },
/* use_mmap */ { true },
/* use_direct_io */ { false },
/* embeddings */ { false },
/* no_op_offload */ { false },
@@ -393,74 +400,57 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
cmd_params_defaults.reps);
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n",
cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
if (llama_supports_rpc()) {
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
}
printf("\n");
printf("test parameters:\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n",
join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n",
join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n",
join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n",
join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n",
join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n",
join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n",
join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n",
join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n");
printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n");
printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n");
printf(" (default: unused)\n");
printf(" -hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo\n");
printf(" (default: unused)\n");
printf(" -hft, --hf-token <token> Hugging Face access token\n");
printf(" (default: value from HF_TOKEN environment variable)\n");
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n", join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n",
join(cmd_params_defaults.no_host, ",").c_str());
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n", join(cmd_params_defaults.no_host, ",").c_str());
printf("\n");
printf(
"Multiple values can be given for each parameter by separating them with ','\n"
@@ -514,6 +504,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.progress = cmd_params_defaults.progress;
params.no_warmup = cmd_params_defaults.no_warmup;
if (const char * env = getenv("HF_TOKEN")) {
params.hf_token = env;
}
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
@@ -531,6 +525,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-hf" || arg == "-hfr" || arg == "--hf-repo") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.hf_repo.insert(params.hf_repo.end(), p.begin(), p.end());
} else if (arg == "-hff" || arg == "--hf-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.hf_file.insert(params.hf_file.end(), p.begin(), p.end());
} else if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hf_token = argv[i];
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -961,6 +975,44 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
exit(1);
}
if (!params.hf_repo.empty()) {
for (size_t i = 0; i < params.hf_repo.size(); i++) {
common_params_model model;
// step 1: no `-hff` provided, we auto-detect based on the `-hf` flag
if (params.hf_file.empty() || params.hf_file[i].empty()) {
auto auto_detected = common_get_hf_file(params.hf_repo[i], params.hf_token, false);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1);
}
model.name = params.hf_repo[i];
model.hf_repo = auto_detected.repo;
model.hf_file = auto_detected.ggufFile;
} else {
model.hf_file = params.hf_file[i];
}
// step 2: construct the model cache path
std::string clean_fname = model.hf_repo + "_" + model.hf_file;
string_replace_all(clean_fname, "\\", "_");
string_replace_all(clean_fname, "/", "_");
model.path = fs_get_cache_file(clean_fname);
// step 3: download the model if not exists
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
bool ok = common_download_model(model, params.hf_token, false);
if (!ok) {
fprintf(stderr, "error: failed to download model from %s\n", model.url.c_str());
exit(1);
}
params.model.push_back(model.path);
}
}
// set defaults
if (params.model.empty()) {
params.model = cmd_params_defaults.model;
+2 -2
View File
@@ -276,7 +276,7 @@ llama_pos server_tokens::pos_next(int64_t n_tokens) const {
size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
if (!has_mtmd) {
return std::min((size_t)(max_pos + 1), tokens.size());
return std::min((size_t)max_pos, tokens.size());
}
size_t idx = 0;
@@ -296,7 +296,7 @@ size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
idx++;
}
if (pos > max_pos) {
if (pos >= max_pos) {
break;
}
}
+1 -1
View File
@@ -170,7 +170,7 @@ public:
// the next position after n_tokens. if n_tokens < 0, return the next position after all tokens.
llama_pos pos_next(int64_t n_tokens = -1) const;
// number of tokens with position <= max_pos
// number of tokens with position < max_pos
size_t size_up_to_pos(llama_pos max_pos) const;
const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
+23 -3
View File
@@ -562,7 +562,7 @@ private:
llama_model_ptr model_dft;
bool add_bos_token = true;
bool add_bos_token = true;
int32_t n_ctx; // total context for all clients / slots
@@ -570,6 +570,7 @@ private:
std::vector<server_slot> slots;
int slots_debug = 0;
int n_empty_consecutive = 0;
std::unique_ptr<server_prompt_cache> prompt_cache;
@@ -728,6 +729,13 @@ private:
}
}
if (llama_model_n_swa(model) == 0) {
if (params_base.swa_full) {
params_base.swa_full = false;
SRV_WRN("%s\n", "swa_full is not supported by this model, it will be disabled");
}
}
// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;
@@ -2133,6 +2141,9 @@ private:
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
const auto & input_tokens = slot.task->tokens;
// used to determine the number of tokens added to the batch for the current slot
const auto n_tokens_prev = batch.n_tokens;
// TODO: maybe move branch to outside of this loop in the future
if (slot.state == SLOT_STATE_STARTED) {
slot.t_start_process_prompt = ggml_time_us();
@@ -2371,7 +2382,7 @@ private:
} else {
pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024);
SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) checkpoint_size / 1024 / 1024);
}
}
@@ -2525,6 +2536,9 @@ private:
}
}
// the number of tokens added to the batch for the current slot
const auto n_tokens_cur = batch.n_tokens - n_tokens_prev;
// entire prompt has been processed
if (slot.prompt.n_tokens() == slot.task->n_tokens()) {
slot.state = SLOT_STATE_DONE_PROMPT;
@@ -2585,7 +2599,7 @@ private:
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ slot.prompt.n_tokens() - batch.n_tokens,
/*.n_tokens = */ slot.prompt.n_tokens() - n_tokens_cur,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
});
@@ -2628,6 +2642,12 @@ private:
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
if (++n_empty_consecutive > 3) {
GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277");
}
} else {
n_empty_consecutive = 0;
}
int32_t i_next = 0;
+3 -2
View File
@@ -30,12 +30,13 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
}
SRV_INF("proxying %s request to %s://%s%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.path.c_str());
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
auto proxy = std::make_unique<server_http_proxy>(
method,
parsed_url.scheme,
parsed_url.host,
parsed_url.scheme == "http" ? 80 : 443,
parsed_url.port,
parsed_url.path,
req.headers,
req.body,
+3 -1
View File
@@ -783,6 +783,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
}
auto proxy = std::make_unique<server_http_proxy>(
method,
"http",
CHILD_ADDR,
meta->port,
proxy_path,
@@ -1079,6 +1080,7 @@ static bool should_strip_proxy_header(const std::string & header_name) {
server_http_proxy::server_http_proxy(
const std::string & method,
const std::string & scheme,
const std::string & host,
int port,
const std::string & path,
@@ -1092,7 +1094,7 @@ server_http_proxy::server_http_proxy(
auto cli = std::make_shared<httplib::ClientImpl>(host, port);
auto pipe = std::make_shared<pipe_t<msg_t>>();
if (port == 443) {
if (scheme == "https") {
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
cli.reset(new httplib::SSLClient(host, port));
#else
+1
View File
@@ -180,6 +180,7 @@ struct server_http_proxy : server_http_res {
std::function<void()> cleanup = nullptr;
public:
server_http_proxy(const std::string & method,
const std::string & scheme,
const std::string & host,
int port,
const std::string & path,
+41
View File
@@ -0,0 +1,41 @@
import pytest
from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
def test_mcp_no_proxy():
global server
server.webui_mcp_proxy = False
server.start()
res = server.make_request("GET", "/cors-proxy")
assert res.status_code == 404
def test_mcp_proxy():
global server
server.webui_mcp_proxy = True
server.start()
url = f"http://{server.server_host}:{server.server_port}/cors-proxy?url=http://example.com"
res = requests.get(url)
assert res.status_code == 200
assert "Example Domain" in res.text
def test_mcp_proxy_custom_port():
global server
server.webui_mcp_proxy = True
server.start()
# try getting the server's models API via the proxy
res = server.make_request("GET", f"/cors-proxy?url=http://{server.server_host}:{server.server_port}/models")
assert res.status_code == 200
assert "data" in res.body
+3
View File
@@ -102,6 +102,7 @@ class ServerProcess:
mmproj_url: str | None = None
media_path: str | None = None
sleep_idle_seconds: int | None = None
webui_mcp_proxy: bool = False
# session variables
process: subprocess.Popen | None = None
@@ -236,6 +237,8 @@ class ServerProcess:
server_args.extend(["--media-path", self.media_path])
if self.sleep_idle_seconds is not None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")
args = [str(arg) for arg in [server_path, *server_args]]
print(f"tests: starting server with: {' '.join(args)}")