Compare commits

...

10 Commits

Author SHA1 Message Date
Tarek Dakhran 7dad2f1a17 chat : fix LFM2 tool-call parsing double-escaping (#24667)
* Add escape test cases

* chat : fix LFM2 tool-call parsing double-escaping
2026-06-15 22:10:09 +02:00
Xuan-Son Nguyen e36a602ba3 mtmd: fix miscounting n_tokens (#24656) 2026-06-15 18:07:14 +02:00
Piotr Wilkin (ilintar) 38d546330a chat: include full unparsed prompt in debug (#24650)
message on parse error
2026-06-15 17:33:54 +02:00
Julien Jerphanion a1eb756c0b docs: Add instructions to install llama.cpp from conda-forge (#22219)
* docs: Add instructions to install `llama.cpp` from conda-forge

Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>

* Rewording of instructions

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-15 17:12:25 +02:00
Pascal 581e8eca8b chat: harden peg-native tool call parsing (#24329)
* chat: harden peg-native tool call parsing

accept an optional leading type: function field in
build_json_tools_flat_keys so openai style tool calls parse on
templates whose serialization opens on the name field.

return a clean error and log the unparsed fragment on a final peg
parse failure instead of throwing the raw parser position and input.

keep the raw arguments string in func_args_not_string when it is not
valid json instead of aborting the prompt render.

* chat: surface peg-native parse failures

a final peg parse failure threw the raw parser position and input. log
the unparsed fragment and raise a clearer error instead, so a model
output that does not match the expected format no longer fails silently
with an empty assistant turn.

minimal change, no behavior change on successful parses.

* chat: handle openai style tool calls in peg-native

* nits

* common: scope OpenAI wrapper grammar trigger via autoparser flag

* chat: gate type:function parsing leniency on the analysis flag

Thread accept_openai_wrapper from the generator to build_json_tools_flat_keys
so the leading "type": "function" field is accepted only when openai_wrapper_trigger is set.
2026-06-15 15:37:04 +02:00
Piotr Wilkin (ilintar) 0ae3f450f0 chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes (#24653)
* chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes

* update erroneous case in PEG parser test
2026-06-15 15:27:47 +02:00
Georgi Gerganov e3cab403bf mtmd : add post-decode callback (#24645)
Assisted-by: pi:llama.cpp/Qwen3.6-27B
2026-06-15 16:02:05 +03:00
Jeff Bolz 9dbc6621ae vulkan: support more CONCAT types (#24579) 2026-06-15 13:19:21 +02:00
Andrei 6eab47181c wasm : fix fallback symbol collision (#24639) 2026-06-15 10:11:59 +03:00
Katostrofik e3bb1add8c SYCL: use native subgroup size for K-quant DMMV (#21700) 2026-06-15 10:10:53 +03:00
22 changed files with 580 additions and 504 deletions
+1 -1
View File
@@ -37,7 +37,7 @@ LLM inference in C/C++
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
- Run with Docker - see our [Docker documentation](docs/docker.md)
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+6 -2
View File
@@ -103,6 +103,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
};
if (autoparser.tools.format.openai_wrapper_trigger) {
// model emits the OpenAI function wrapper, trigger on it
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
}
}
}
@@ -224,13 +228,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
auto single_tool_parser = p.standard_json_tools(
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
} else {
tools_parser = p.standard_json_tools(
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
}
// Handle content wrappers if present
+1
View File
@@ -181,6 +181,7 @@ struct tool_format_analysis {
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
bool openai_wrapper_trigger = false; // model emits the OpenAI function wrapper, trigger on it
std::string function_field = "function";
std::string name_field = "name";
+8
View File
@@ -165,6 +165,14 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
}
},
// template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
tmpl.src.find("Do not use variables.") != std::string::npos) {
analysis.tools.format.openai_wrapper_trigger = true;
LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
}
},
});
+17 -8
View File
@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
});
// Quoted literal as a value: normalize_quotes_to_json preserves escapes.
auto string_value_parser = tool_arg_value(choice({
literal("\"") + string_content('"') + literal("\""),
literal("'") + string_content('\'') + literal("'")
}));
if (is_string_type) {
arg_value_parser = string_value_parser;
@@ -745,7 +746,8 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key,
const std::vector<std::string> & parameters_order) {
const std::vector<std::string> & parameters_order,
bool accept_openai_wrapper) {
auto tool_choices = choice();
auto name_key_parser = literal("\"" + effective_name_key + "\"");
@@ -807,7 +809,13 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
return idx_a < idx_b;
});
auto ordered_body = tool_open(literal("{")) + space();
// accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
common_peg_parser type_field = eps();
if (accept_openai_wrapper) {
type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
literal("\"function\"") + space() + literal(",") + space());
}
auto ordered_body = tool_open(literal("{")) + space() + type_field;
for (size_t i = 0; i < parser_pairs.size(); i++) {
ordered_body = ordered_body + parser_pairs[i].first;
if (i < parser_pairs.size() - 1) {
@@ -870,7 +878,8 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
bool function_is_key,
const std::string & call_id_key,
const std::string & gen_call_id_key,
const std::vector<std::string> & parameters_order) {
const std::vector<std::string> & parameters_order,
bool accept_openai_wrapper) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
@@ -888,7 +897,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
if (!name_spec.first.empty() || !args_spec.first.empty()) {
tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
} else {
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
}
}
+4 -2
View File
@@ -120,7 +120,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
bool function_is_key = false,
const std::string & call_id_key = "",
const std::string & gen_call_id_key = "",
const std::vector<std::string> & parameters_order = {});
const std::vector<std::string> & parameters_order = {},
bool accept_openai_wrapper = false);
// Legacy-compatible helper for building XML/tagged style tool calls
// Used by tests and manual parsers
@@ -157,7 +158,8 @@ class common_chat_peg_builder : public common_peg_parser_builder {
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key,
const std::vector<std::string> & parameters_order);
const std::vector<std::string> & parameters_order,
bool accept_openai_wrapper);
};
inline common_peg_arena build_chat_peg_parser(
+3 -2
View File
@@ -2678,8 +2678,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
}
return msg;
}
throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
effective_input.substr(result.end));
LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
}
common_chat_msg msg;
+22 -2
View File
@@ -1507,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
auto pieces = matcher.collect_prefix_and_next();
std::string pattern;
std::string trailing; // optional proper-prefix of a delimiter, allowed only at the very end
for (size_t i = 0; i < pieces.size(); ++i) {
if (i > 0) {
pattern += " | ";
@@ -1522,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
}
if (!pre.empty()) {
pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
pattern += pre_literal + " [^" + cls + "]";
// Each interior alternative consumes a delimiter-prefix plus a disambiguating
// char, so the repetition alone cannot match a value that *ends* on a proper
// prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
// "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
// values, so without this the grammar would reject input the parser accepts.
// Allow the value to terminate on any proper prefix as an optional tail.
// This makes the grammar a slight superset of the runtime language (a value
// may end on the longest prefix, which greedy first-match would not itself
// produce); harmless for constrained generation, which only needs to admit
// every runtime-valid string.
if (!trailing.empty()) {
trailing += " | ";
}
trailing += pre_literal;
} else {
pattern += "[^" + cls + "]";
}
}
return "(" + pattern + ")*";
std::string result = "(" + pattern + ")*";
if (!trailing.empty()) {
result += " (" + trailing + ")?";
}
return result;
}
static std::unordered_set<std::string> collect_reachable_rules(
+30 -2
View File
@@ -1,12 +1,40 @@
# Install pre-built version of llama.cpp
| Install via | Windows | Mac | Linux |
|-------------|---------|-----|-------|
| Install via | Windows | Mac | Linux |
|-------------|---------|------|-------|
| conda-forge | ✅ | ✅ | ✅ |
| Winget | ✅ | | |
| Homebrew | | ✅ | ✅ |
| MacPorts | | ✅ | |
| Nix | | ✅ | ✅ |
## conda-forge (Windows, Mac and Linux)
conda-forge provides builds for:
- CUDA (Windows and Linux)
- Vulkan (Windows and Linux)
- Apple Metal (macOS)
```sh
conda install -c conda-forge llama-cpp
```
```sh
mamba install -c conda-forge llama-cpp
```
```sh
# Project-local installation
pixi add llama-cpp
# Global installation
pixi global install llama-cpp
```
This distribution is managed on [`conda-forge/llama.cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/).
Should you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues).
## Winget (Windows)
```sh
-1
View File
@@ -293,7 +293,6 @@
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
File diff suppressed because it is too large Load Diff
+29 -12
View File
@@ -798,7 +798,7 @@ struct vk_device_struct {
vk_pipeline pipeline_add_id_f32;
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
vk_pipeline pipeline_concat_i8, pipeline_concat_i16, pipeline_concat_i32, pipeline_concat_i64;
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
@@ -4996,9 +4996,10 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_i8, "concat_i8", concat_i8_len, concat_i8_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_i16, "concat_i16", concat_i16_len, concat_i16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_i64, "concat_i64", concat_i64_len, concat_i64_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
@@ -10318,17 +10319,27 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_add_id_f32;
}
return nullptr;
case GGML_OP_CONCAT:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_concat_f32;
case GGML_OP_CONCAT: {
if (src0->type != src1->type || src0->type != dst->type) {
return nullptr;
}
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_concat_f16;
if (ggml_blck_size(src0->type) != 1) {
return nullptr;
}
if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
const size_t type_size = ggml_type_size(src0->type);
switch (type_size) {
case 1:
return ctx->device->pipeline_concat_i8;
case 2:
return ctx->device->pipeline_concat_i16;
case 4:
return ctx->device->pipeline_concat_i32;
case 8:
return ctx->device->pipeline_concat_i64;
default:
return nullptr;
}
return nullptr;
}
case GGML_OP_UPSCALE:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS));
@@ -17042,8 +17053,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_SET:
return op->src[0]->type == op->src[1]->type && op->src[0]->type == op->type &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_I32);
case GGML_OP_CONCAT:
return ggml_type_size(op->src[0]->type) == ggml_type_size(GGML_TYPE_F32);
case GGML_OP_CONCAT: {
if (op->src[0]->type != op->src[1]->type || op->src[0]->type != op->type) {
return false;
}
const size_t type_size = ggml_type_size(op->type);
return ggml_blck_size(op->type) == 1 &&
(type_size == 1 || type_size == 2 || type_size == 4 || type_size == 8);
}
case GGML_OP_ADD1:
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32)
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
@@ -862,9 +862,10 @@ void process_shaders() {
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
string_to_spv("concat_i8", "concat.comp", {{"A_TYPE", "uint8_t"}, {"B_TYPE", "uint8_t"}, {"D_TYPE", "uint8_t"}});
string_to_spv("concat_i16", "concat.comp", {{"A_TYPE", "uint16_t"}, {"B_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "uint"}, {"B_TYPE", "uint"}, {"D_TYPE", "uint"}});
string_to_spv("concat_i64", "concat.comp", {{"A_TYPE", "uvec2"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "uvec2"}});
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+1 -1
View File
@@ -129,7 +129,7 @@ void test_gbnf_generation(testing &t) {
});
assert_gbnf_equal(t, R"""(
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* ("<" | "</" | "</t" | "</ta" | "</tag")?
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", gbnf);
});
+3 -3
View File
@@ -130,12 +130,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
}
}
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16) {
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
ggml_backend_tensor_set(tensor, data.data(), 0, nels * ggml_type_size(tensor->type));
} else if (tensor->type == GGML_TYPE_I64) {
// Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
const size_t nbytes_half = ggml_nbytes(tensor)/2;
const size_t nbytes_half = nels * sizeof(float);
ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
} else {
+75 -2
View File
@@ -1882,11 +1882,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
.run();
// Python tool with multiline code in string
// Python tool with multiline code in string: the \n in the literal decodes to a real
// newline, emitted as a JSON \n escape (not a doubled backslash).
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
{ "python", R"#({"code": "def hello():\n print('hey')"})#", "" }
})
.run();
// String escape sequences decode to their actual characters (newline + tab here),
// so a "write a two line file" style call produces real line breaks, not literal "\n".
tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "First line\nSecond line\tindented"})#", "" }
})
.run();
// Escaped quotes inside a string argument survive the round-trip.
tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "print(\"hi\")"})#", "" }
})
.run();
@@ -2024,6 +2042,61 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
})
.run();
tst.test(
"<tool_call>\n"
"<function=edit>\n"
"<parameter=filename>\n"
"foo.c\n"
"</parameter>\n"
"<parameter=oldString>\n"
"#iclunde\n"
"</parameter>\n"
"<parameter=newString>\n"
"#include\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
edit_tool
})
.expect_tool_calls({
{ "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} },
})
.run();
// a parameter value that itself ends in a newline (e.g. a source file with a
// trailing newline). The structural delimiter is "\n</parameter>\n", so the value
// "#include\n" renders as "...#include\n\n</parameter>\n". The trailing newline must
// be preserved faithfully (no stripping), and the generated grammar must admit a
// value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern.
tst.test(
"<tool_call>\n"
"<function=edit>\n"
"<parameter=filename>\n"
"foo.c\n"
"</parameter>\n"
"<parameter=oldString>\n"
"#iclunde\n"
"</parameter>\n"
"<parameter=newString>\n"
"#include\n"
"\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
edit_tool
})
.expect_tool_calls({
{ "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} },
})
.run();
// test code that starts with indent
tst.test(
"<tool_call>\n"
+18 -3
View File
@@ -247,7 +247,9 @@ int32_t mtmd_helper_decode_image_chunk(
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past) {
llama_pos * new_n_past,
mtmd_helper_post_decode_callback callback,
void * user_data) {
GGML_ASSERT(n_batch > 0);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
@@ -302,10 +304,23 @@ int32_t mtmd_helper_decode_image_chunk(
int32_t ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode %s\n", name);
llama_set_causal_attn(lctx, true); // restore causal attn
if (use_non_causal) {
llama_set_causal_attn(lctx, true);
}
return ret;
}
if (callback != nullptr) {
ret = callback(batch_embd_view, user_data);
if (ret != 0) {
LOG_ERR("post-decode callback failed\n");
if (use_non_causal) {
llama_set_causal_attn(lctx, true);
}
return ret;
}
}
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
i_batch++;
@@ -379,7 +394,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
float * embd = mtmd_get_output_embd(ctx);
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past, nullptr, nullptr);
if (ret != 0) {
LOG_ERR("failed to decode %s\n", name);
llama_batch_free(text_batch);
+5 -1
View File
@@ -91,6 +91,8 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
bool logits_last,
llama_pos * new_n_past);
typedef int32_t (*mtmd_helper_post_decode_callback)(struct llama_batch batch, void * user_data);
// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
// if a post-decode callback is provided and returns non-zero, that value is returned instead
@@ -101,7 +103,9 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past);
llama_pos * new_n_past,
mtmd_helper_post_decode_callback callback,
void * user_data);
//
// video input helpers (requires ffmpeg/ffprobe installed on the system)
+8 -9
View File
@@ -96,16 +96,15 @@ struct mtmd_image_tokens {
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
return (nx + 1) * ny + 2;
}
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
return nx * ny;
}
uint32_t nz = batch_f32.entries.size();
// TODO: simplify this by repeating the last frame until it fits the temporal merge
if (nz % n_temporal_merge != 0) {
nz = nz / n_temporal_merge + 1;
} else {
nz = nz / n_temporal_merge;
if (n_temporal_merge > 1) {
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
// TODO: simplify this by repeating the last frame until it fits the temporal merge
if (nz % n_temporal_merge != 0) {
nz = nz / n_temporal_merge + 1;
} else {
nz = nz / n_temporal_merge;
}
}
return nx * ny * nz;
}
-31
View File
@@ -539,37 +539,6 @@ bool server_tokens::validate(const struct llama_context * ctx) const {
return true;
}
// Encode and decode the multimodal chunk located at token index `idx`.
//
// The chunk is looked up with find_chunk(idx) and evaluated through
// mtmd_helper_eval_chunk_single() at position `pos` for sequence `seq_id`,
// using llama_n_batch(ctx) as the batch size. The elapsed time is logged.
//
// Returns 0 on success, in which case `n_tokens_out` is set to
// mtmd_input_chunk_get_n_tokens() of the chunk. On failure `n_tokens_out`
// is set to 0 and the helper's non-zero status is returned unchanged.
int32_t server_tokens::process_chunk(
llama_context * ctx,
mtmd_context * mctx,
size_t idx,
llama_pos pos,
int32_t seq_id,
size_t & n_tokens_out) const {
const auto & chunk = find_chunk(idx);
// human-readable chunk kind, used only in the log messages below
const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
? "image" : "audio";
SRV_INF("processing %s...\n", name);
int32_t n_batch = llama_n_batch(ctx);
// start timestamp for the duration reported after the helper returns
int64_t t0 = ggml_time_ms();
llama_pos new_n_past; // unused for now
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
chunk.get(),
pos,
seq_id,
n_batch,
true, // logits last
&new_n_past);
// the duration is logged regardless of whether the helper succeeded
SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
if (result != 0) {
// NOTE(review): this message has no trailing "\n", unlike the SRV_INF calls above
LOG_ERR("mtmd_helper_eval failed with status %d", result);
n_tokens_out = 0;
return result;
}
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
return 0;
}
server_tokens server_tokens::clone() const {
server_tokens res;
res.has_mtmd = has_mtmd;
-9
View File
@@ -221,15 +221,6 @@ public:
// make sure all text tokens are within the vocab range
bool validate(const struct llama_context * ctx) const;
// encode and decode the image chunk
int32_t process_chunk(
llama_context * ctx,
mtmd_context * mctx,
size_t idx,
llama_pos pos,
int32_t seq_id,
size_t & n_tokens_out) const;
server_tokens clone() const;
};
+27 -77
View File
@@ -15,11 +15,6 @@
#include "mtmd.h"
#include "mtmd-helper.h"
#include "ggml-cpp.h"
// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING]
#include "../../src/llama-ext.h"
#include <algorithm>
#include <cstddef>
#include <cinttypes>
@@ -81,7 +76,6 @@ struct server_slot {
// multimodal
mtmd_context * mctx = nullptr;
mtmd::batch_ptr mbatch = nullptr;
std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context
// speculative decoding
common_speculative * spec;
@@ -244,15 +238,6 @@ struct server_slot {
// clear multimodal state
mbatch.reset();
mtgt[0] = ctx_tgt;
mtgt[1] = nullptr;
if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
// TODO: in the future, figure out how to infuse target embeddings to the images
// for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
// maybe we simply need to call `common_speculative_process()` ?
// [TAG_MTMD_DRAFT_PROCESSING]
mtgt[1] = ctx_dft;
}
}
void init_sampler() const {
@@ -598,32 +583,38 @@ struct server_slot {
int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
GGML_ASSERT(mctx);
const auto & input_tokens = task->tokens;
auto & chunk = input_tokens.find_chunk(idx);
const auto & chunk = input_tokens.find_chunk(idx);
int32_t res = 0;
auto try_decode = [&]() -> int32_t {
if (mbatch) {
float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
if (embd) {
for (auto * lctx : mtgt) {
if (lctx == nullptr) {
continue;
}
llama_pos new_n_past; // unused for now
res = mtmd_helper_decode_image_chunk(
mctx,
lctx,
chunk.get(),
embd,
prompt.tokens.pos_next(),
id,
llama_n_batch(lctx),
&new_n_past
);
if (res != 0) {
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
return -1;
void * cb_data = spec;
static auto cb = [](llama_batch batch, void * user_data) {
common_speculative * spec = static_cast<common_speculative *>(user_data);
if (!common_speculative_process(spec, batch)) {
return 1;
}
return 0;
};
llama_pos new_n_past; // unused for now
res = mtmd_helper_decode_image_chunk(
mctx,
ctx_tgt,
chunk.get(),
embd,
prompt.tokens.pos_next(),
id,
llama_n_batch(ctx_tgt),
&new_n_past,
cb,
cb_data
);
if (res != 0) {
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
return -1;
}
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
return 0; // success
@@ -636,7 +627,8 @@ struct server_slot {
res = try_decode();
if (res == 0) {
return 0;
} else if (res < 0) {
}
if (res < 0) {
// fatal error
return res;
}
@@ -3350,48 +3342,6 @@ private:
// TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL]
// for now, always re-evaluate for simplicity
// ref: https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4400925384
//
// | spec type | need re-eval |
// | --- | --- |
// | draft model | no | because the draft model does not use embeddings from the target
// | MTP (std) | yes |
// | MTP Gemma4 | no | because the KV cache is shared
// | Eagle3 | yes |
// | DFlash | yes | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982
//
// note: this logic is now moved in `common_speculative_process()`
// keeping the sketch here until for a bit, until the logic is finalized
//
//if (ctx_dft) {
// // TODO: update as needed for MTP, Eagle3, etc.
// const bool need_tgt_embd = false;
// if (need_tgt_embd) {
// llama_synchronize(ctx_tgt);
// }
// // the logic here varies depending on the speculative decoding method
// // - some draft contexts require embeddings from the target context, others don't
// // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
// // TODO: extract this in a function ?
// {
// // TODO: hook the embeddings from the last target batch here
// if (llama_model_has_encoder(model_dft.get())) {
// //llama_encode(ctx_dft, ...);
// GGML_ABORT("not implemented yet\n");
// }
// const int ret = llama_decode(ctx_dft.get(), batch_view);
// if (ret != 0) {
// SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
// // TODO: handle error
// break;
// }
// }
//}
if (!common_speculative_process(spec.get(), batch_view)) {
SRV_ERR("%s", "failed to process speculative batch\n");