Compare commits

...

14 Commits

Author SHA1 Message Date
Daniel Bevenius eaf1d7930c llama : add support for Nemotron 3 Super (#20411)
* llama : add support for Nemotron 3 Super

This commit adds support for the Nemotron 3 Super model (120B.A12B)
enabling this model to be converted to GGUF format and run in llama.cpp.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Matt Clayton <156335168+mattjcly@users.noreply.github.com>
2026-03-11 19:27:53 +01:00
Georgi Gerganov 76ea1c1c46 metal : fix capture_compute counter logic (#20410) 2026-03-11 18:38:22 +02:00
Aman Gupta bd1ec818e9 compare-llama-bench: check remotes as well (#20406) 2026-03-12 00:14:42 +08:00
Georgi Gerganov b541241104 metal : fix q5_k mul_mv register spill (#20399) 2026-03-11 16:25:27 +02:00
Georgi Gerganov c363256839 metal : add env var to trigger graph capture (#20398) 2026-03-11 16:25:10 +02:00
Neo Zhang ecac98ee53 [SYCL] Update SYCL.md for binary package for Windows (#20401)
* add download binary package

* update prefix
2026-03-11 22:21:22 +08:00
Ruben Ortlam 182acfe5c5 ci: disable coopmat on ubuntu-24-cmake-vulkan job (#20294) 2026-03-11 14:12:29 +01:00
Aldehir Rojas b5fe4559ae common/parser: use nlohmann::ordered_json to preserve parameter order (#20385) 2026-03-11 10:26:51 +01:00
Piotr Wilkin (ilintar) acb7c79069 common/parser: handle reasoning budget (#20297)
* v1

* Finished!

* Handlie cli

* Reasoning sampler

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Less explosive terminology :)

* Add utf-8 case and tests

* common : migrate reasoning budget sampler to common

* cont : clean up

* cont : expose state and allow passing as initial state

* cont : remove unused imports

* cont : update state machine doc string

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Alde Rojas <hello@alde.dev>
2026-03-11 10:26:12 +01:00
uvos 5f91b1d5d5 ggml-cuda: gdn use shared mem for HIP (#20366)
Suggested-by: Aman Gupta <amangupta052@gmail.com>
2026-03-11 13:06:19 +08:00
uvos 9ef7523ee9 cuda/hip: fix loop unrolling in ssm-conv (#20369) 2026-03-11 13:04:32 +08:00
Pascal 00de615345 Fix agentic mcp image single model (#20339)
* webui: fix MCP image attachments dropped during the agentic loop in single-model mode

* chore: update webui build output
2026-03-11 05:31:33 +01:00
Alessandro de Oliveira Faria (A.K.A.CABELO) e1a399992b vendor : update cpp-httplib to 0.37.0 (#20207) 2026-03-11 11:03:53 +08:00
Alessandro de Oliveira Faria (A.K.A.CABELO) 4f2f0a163d vendor : update miniaudio to 0.11.25 (#20209) 2026-03-11 11:01:56 +08:00
44 changed files with 1077 additions and 204 deletions
+1
View File
@@ -469,6 +469,7 @@ jobs:
cd build
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
export GGML_VK_DISABLE_COOPMAT=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
+2
View File
@@ -81,6 +81,8 @@ add_library(${TARGET} STATIC
preset.cpp
preset.h
regex-partial.cpp
reasoning-budget.cpp
reasoning-budget.h
regex-partial.h
sampling.cpp
sampling.h
+31 -2
View File
@@ -2913,6 +2913,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
if (item.key() == "enable_thinking") {
LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
"Use --reasoning on / --reasoning off instead.\n");
}
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
@@ -3048,14 +3052,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.reasoning_format = common_reasoning_format_from_name(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"-rea", "--reasoning"}, "[on|off|auto]",
"Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.enable_reasoning = 1;
params.default_template_kwargs["enable_thinking"] = "true";
} else if (is_falsey(value)) {
params.enable_reasoning = 0;
params.default_template_kwargs["enable_thinking"] = "false";
} else if (is_autoy(value)) {
params.enable_reasoning = -1;
} else {
throw std::invalid_argument(
string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
+3 -1
View File
@@ -135,7 +135,9 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
if (thinking_forced_open || thinking_forced_closed) {
// Thinking is forced open OR forced closed with enable_thinking=true
// In both cases, expect only the closing tag (opening was in template)
return p.reasoning(p.until(end)) + end;
// However, since we might have incorrectly detected the open/close pattern,
// we admit an optional starting marker
return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
}
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
// Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
+24 -24
View File
@@ -6,7 +6,7 @@
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;
using ordered_json = nlohmann::ordered_json;
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
int count = 0;
@@ -68,7 +68,7 @@ static int json_brace_depth(const std::string & s) {
// JSON-escape a string and return the inner content (without surrounding quotes).
static std::string escape_json_string_inner(const std::string & s) {
std::string escaped = json(s).dump();
std::string escaped = ordered_json(s).dump();
if (escaped.size() >= 2 && escaped.front() == '"' && escaped.back() == '"') {
return escaped.substr(1, escaped.size() - 2);
}
@@ -309,7 +309,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
if (arg_count > 0) {
arg_entry = ",";
}
arg_entry += json(trim(node.text)).dump() + ":";
arg_entry += ordered_json(trim(node.text)).dump() + ":";
++arg_count;
auto & target = args_target();
@@ -343,7 +343,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
// Try to parse as JSON value (number, bool, null, object, array)
try {
json parsed = json::parse(value_content);
ordered_json parsed = ordered_json::parse(value_content);
if (parsed.is_string()) {
// Don't add closing quote yet (added by arg_close) for monotonic streaming
std::string escaped = parsed.dump();
@@ -408,7 +408,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
common_peg_parser common_chat_peg_builder::standard_constructed_tools(
const std::map<std::string, std::string> & markers,
const nlohmann::json & tools,
const ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls) {
if (!tools.is_array() || tools.empty()) {
@@ -439,7 +439,7 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
// Build argument parsers
auto args = eps();
@@ -479,8 +479,8 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const nlohmann::json & tools,
bool parallel_tool_calls) {
const ordered_json & tools,
bool parallel_tool_calls) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
@@ -493,7 +493,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto args = eps();
if (params.contains("properties") && !params["properties"].empty()) {
@@ -555,11 +555,11 @@ static std::pair<std::string, std::string> parse_key_spec(const std::string & ke
// Mode 1: function_is_key — parse {"function_name": {...}}
common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
const nlohmann::json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
const ordered_json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
auto tool_choices = choice();
@@ -569,7 +569,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
// Build inner object fields
std::vector<common_peg_parser> inner_fields;
@@ -634,11 +634,11 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
// Mode 2: Nested keys (dot notation like "function.name")
common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
const nlohmann::json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
const ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
auto tool_choices = choice();
@@ -655,7 +655,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
@@ -706,7 +706,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
// Mode 3: Flat keys with optional ID fields and parameter ordering
common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
const nlohmann::json & tools,
const ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
@@ -723,7 +723,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
@@ -791,7 +791,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
common_peg_parser common_chat_peg_builder::standard_json_tools(
const std::string & section_start,
const std::string & section_end,
const nlohmann::json & tools,
const ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls,
const std::string & name_key,
+15 -15
View File
@@ -94,7 +94,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// parameters_order: order in which JSON fields should be parsed
common_peg_parser standard_json_tools(const std::string & section_start,
const std::string & section_end,
const nlohmann::json & tools,
const nlohmann::ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls,
const std::string & name_key = "",
@@ -108,30 +108,30 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// Legacy-compatible helper for building XML/tagged style tool calls
// Used by tests and manual parsers
common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
const nlohmann::json & tools,
const nlohmann::ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls);
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::json & tools,
bool parallel_tool_calls);
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
bool parallel_tool_calls);
private:
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_nested_keys(const nlohmann::json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_flat_keys(const nlohmann::json & tools,
common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
+18 -4
View File
@@ -857,7 +857,9 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
data.supports_thinking = true;
data.supports_thinking = true;
data.thinking_start_tag = "[THINK]";
data.thinking_end_tag = "[/THINK]";
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = {
@@ -1165,9 +1167,11 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.thinking_start_tag = "<think>";
data.thinking_end_tag = "</think>";
data.preserved_tokens = {
"<|tool_calls_section_begin|>",
"<|tool_calls_section_end|>",
@@ -1527,6 +1531,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
autoparser.analyze_template(tmpl);
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
// FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
// (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
// but forces <think> open when thinking is enabled)
auto_params.thinking_forced_open =
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
}
return auto_params;
} catch (const std::exception & e) {
throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
+2
View File
@@ -213,6 +213,8 @@ struct common_chat_params {
bool grammar_lazy = false;
bool thinking_forced_open = false;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
+10
View File
@@ -235,6 +235,14 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
bool backend_sampling = false;
bool has_logit_bias() const {
@@ -536,7 +544,9 @@ struct common_params {
bool use_jinja = true; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
+219
View File
@@ -0,0 +1,219 @@
#include "reasoning-budget.h"
#include "common.h"
#include "unicode.h"
#include "log.h"
#include <cmath>
#include <cstdint>
#include <string>
#include <vector>
struct token_matcher {
std::vector<llama_token> tokens;
size_t pos = 0;
bool advance(llama_token token) {
if (tokens.empty()) {
return false;
}
if (token == tokens[pos]) {
pos++;
if (pos >= tokens.size()) {
pos = 0;
return true;
}
} else {
pos = 0;
if (token == tokens[0]) {
pos = 1;
}
}
return false;
}
void reset() { pos = 0; }
};
struct common_reasoning_budget_ctx {
const llama_vocab * vocab;
token_matcher start_matcher;
token_matcher end_matcher;
std::vector<llama_token> forced_tokens;
int32_t budget; // maximum tokens in reasoning block
int32_t remaining; // tokens remaining in budget
common_reasoning_budget_state state;
// for forcing
size_t force_pos; // next position in forced_tokens to force
};
static const char * common_reasoning_budget_name(const struct llama_sampler * /*smpl*/) {
return "reasoning-budget";
}
static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_token token) {
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
switch (ctx->state) {
case REASONING_BUDGET_IDLE:
{
if (ctx->start_matcher.advance(token)) {
ctx->state = REASONING_BUDGET_COUNTING;
ctx->remaining = ctx->budget;
LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
if (ctx->remaining <= 0) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
}
}
break;
}
case REASONING_BUDGET_COUNTING:
case REASONING_BUDGET_WAITING_UTF8:
{
if (ctx->end_matcher.advance(token)) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: deactivated (natural end)\n");
break;
}
bool utf8_complete = true;
if (ctx->vocab != nullptr) {
const std::string piece = common_token_to_piece(ctx->vocab, token, false);
utf8_complete = common_utf8_is_complete(piece);
}
if (ctx->state == REASONING_BUDGET_WAITING_UTF8) {
if (utf8_complete) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
ctx->end_matcher.reset();
LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
}
} else if (ctx->state == REASONING_BUDGET_COUNTING) {
ctx->remaining--;
if (ctx->remaining <= 0) {
if (utf8_complete) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
ctx->end_matcher.reset();
LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
} else {
ctx->state = REASONING_BUDGET_WAITING_UTF8;
ctx->end_matcher.reset();
LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
}
}
}
break;
}
case REASONING_BUDGET_FORCING:
// force_pos is advanced in apply(), not here.
// This ensures the first forced token isn't skipped when the sampler
// is initialized directly in FORCING state (e.g. COUNTING + budget=0)
break;
case REASONING_BUDGET_DONE:
break;
}
}
static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
if (ctx->state != REASONING_BUDGET_FORCING) {
// passthrough — don't modify logits
return;
}
if (ctx->force_pos >= ctx->forced_tokens.size()) {
return;
}
const llama_token forced = ctx->forced_tokens[ctx->force_pos];
// set all logits to -inf except the forced token
for (size_t i = 0; i < cur_p->size; i++) {
if (cur_p->data[i].id != forced) {
cur_p->data[i].logit = -INFINITY;
}
}
// advance to next forced token (done here rather than in accept so that
// the first forced token isn't skipped when starting in FORCING state)
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
}
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
ctx->state = REASONING_BUDGET_IDLE;
ctx->remaining = ctx->budget;
ctx->start_matcher.reset();
ctx->end_matcher.reset();
ctx->force_pos = 0;
}
static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
return common_reasoning_budget_init(
ctx->vocab,
ctx->start_matcher.tokens,
ctx->end_matcher.tokens,
ctx->forced_tokens,
ctx->budget,
ctx->state);
}
static void common_reasoning_budget_free(struct llama_sampler * smpl) {
delete (common_reasoning_budget_ctx *) smpl->ctx;
}
static struct llama_sampler_i common_reasoning_budget_i = {
/* .name = */ common_reasoning_budget_name,
/* .accept = */ common_reasoning_budget_accept,
/* .apply = */ common_reasoning_budget_apply,
/* .reset = */ common_reasoning_budget_reset,
/* .clone = */ common_reasoning_budget_clone,
/* .free = */ common_reasoning_budget_free,
/* .backend_init = */ nullptr,
/* .backend_accept = */ nullptr,
/* .backend_apply = */ nullptr,
/* .backend_set_input = */ nullptr,
};
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state) {
// promote COUNTING with budget <= 0 to FORCING
if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
initial_state = REASONING_BUDGET_FORCING;
}
return llama_sampler_init(
/* .iface = */ &common_reasoning_budget_i,
/* .ctx = */ new common_reasoning_budget_ctx {
/* .vocab = */ vocab,
/* .start_matcher = */ { start_tokens, 0 },
/* .end_matcher = */ { end_tokens, 0 },
/* .forced_tokens = */ forced_tokens,
/* .budget = */ budget,
/* .remaining = */ budget,
/* .state = */ initial_state,
/* .force_pos = */ 0,
}
);
}
+41
View File
@@ -0,0 +1,41 @@
#pragma once
#include "llama.h"
#include <cstdint>
#include <vector>
enum common_reasoning_budget_state {
REASONING_BUDGET_IDLE, // waiting for start sequence
REASONING_BUDGET_COUNTING, // counting down tokens
REASONING_BUDGET_FORCING, // forcing budget message + end sequence
REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion
REASONING_BUDGET_DONE, // passthrough forever
};
// Creates a reasoning budget sampler that limits token generation inside a
// reasoning block (e.g. between <think> and </think>).
//
// State machine: IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE
// IDLE: passthrough, watching for start_tokens sequence
// COUNTING: counting down remaining tokens, watching for natural end_tokens
// WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
// FORCING: forces forced_tokens token-by-token (all other logits -> -inf)
// DONE: passthrough forever
//
// Parameters:
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
// start_tokens - token sequence that activates counting
// end_tokens - token sequence for natural deactivation
// forced_tokens - token sequence forced when budget expires
// budget - max tokens allowed in the reasoning block
// initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
// note: COUNTING with budget <= 0 is promoted to FORCING
//
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state);
+12
View File
@@ -2,6 +2,7 @@
#include "common.h"
#include "log.h"
#include "reasoning-budget.h"
#include <algorithm>
#include <cmath>
@@ -250,6 +251,17 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
samplers.push_back(common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens,
params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
}
if (params.has_logit_bias()) {
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
}
+17 -1
View File
@@ -1,8 +1,10 @@
#include "unicode.h"
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <vector>
#include <string>
#include <vector>
// implementation adopted from src/unicode.cpp
@@ -67,6 +69,20 @@ utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t off
return utf8_parse_result(utf8_parse_result::INVALID);
}
bool common_utf8_is_complete(const std::string & s) {
if (s.empty()) {
return true;
}
for (int i = 1; i <= std::min(4, (int)s.size()); i++) {
unsigned char c = s[s.size() - i];
if ((c & 0xC0) != 0x80) {
int expected = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1;
return i >= expected;
}
}
return false;
}
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {
+3
View File
@@ -20,6 +20,9 @@ struct utf8_parse_result {
// Returns 0 for invalid first bytes
size_t common_utf8_sequence_length(unsigned char first_byte);
// Check if a string ends with a complete UTF-8 sequence.
bool common_utf8_is_complete(const std::string & s);
// Parse a single UTF-8 codepoint from input
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset);
+33 -8
View File
@@ -9743,20 +9743,35 @@ class NemotronHModel(GraniteHybridModel):
# M: Mamba2, *: Attention, -: MLP
# MoE:
# M: Mamba2, *: Attention, E: Expert
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
if pattern is None:
self._ssm_layers = []
self._mlp_layers = []
elif isinstance(pattern, str):
self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"]
self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")]
else:
self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"]
self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"]
def get_attn_layers(self):
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
if pattern is None:
return []
assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!"
if isinstance(pattern, str):
return [i for i, val in enumerate(pattern) if val == "*"]
return [i for i, val in enumerate(pattern) if val == "attention"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_key_length(self.head_dim)
self.gguf_writer.add_value_length(self.head_dim)
head_dim = self.head_dim
if head_dim is None:
raise ValueError("Could not find the attention head dim in config")
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
# Set feed_forward_length
# NOTE: This will trigger an override warning. This is preferable to
@@ -9784,6 +9799,9 @@ class NemotronHModel(GraniteHybridModel):
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
if (latent_size := self.hparams.get("moe_latent_size")) is not None:
self.gguf_writer.add_moe_latent_size(latent_size)
def set_vocab(self):
super().set_vocab()
@@ -9803,6 +9821,13 @@ class NemotronHModel(GraniteHybridModel):
name = name[len("language_model."):]
if self.is_moe and bid is not None:
# Skip Multi-Token Prediction (MTP) tensors. These are used for
# for speculative decoding but we don't include them in this model
# conversion. See https://github.com/ggml-org/llama.cpp/pull/18886
if "mtp" in name:
logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}")
return []
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
+27 -17
View File
@@ -382,17 +382,27 @@ use 1 SYCL GPUs: [0] with Max compute units:512
## Windows
### I. Setup Environment
1. Install GPU driver
### Install GPU driver
Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Visual Studio
### Option 1: download the binary package directly
Download the binary package for Windows from: https://github.com/ggml-org/llama.cpp/releases.
Extract the package to local folder, run the llama tools directly. Refer to [Run the inference](#iii-run-the-inference-1).
Note, the package includes the SYCL running time and all depended dll files, no need to install oneAPI package and activte them.
### Option 2: build locally from the source code.
#### I. Setup environment
1. Install Visual Studio
If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
3. Install Intel® oneAPI Base toolkit
2. Install Intel® oneAPI Base toolkit
SYCL backend depends on:
- Intel® oneAPI DPC++/C++ compiler/running-time.
@@ -443,25 +453,25 @@ Output (example):
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
4. Install build tools
3. Install build tools
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
### II. Build llama.cpp
#### II. Build llama.cpp
You could download the release package for Windows directly, which including binary files and depended oneAPI dll files.
Choose one of following methods to build from source code.
#### 1. Script
##### Option 1: Script
```sh
.\examples\sycl\win-build-sycl.bat
```
#### 2. CMake
##### Option 2: CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
@@ -490,7 +500,7 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-completion
```
#### 3. Visual Studio
##### Option 3: Visual Studio
You have two options to use Visual Studio to build llama.cpp:
- As CMake Project using CMake presets.
@@ -500,7 +510,7 @@ You have two options to use Visual Studio to build llama.cpp:
All following commands are executed in PowerShell.
##### - Open as a CMake Project
###### - Open as a CMake Project
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
@@ -515,7 +525,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
cmake --build build --config Release -j --target llama-completion
```
##### - Generating a Visual Studio Solution
###### - Generating a Visual Studio Solution
You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
@@ -603,7 +613,7 @@ found 2 SYCL devices:
```
#### Choose level-zero devices
##### Choose level-zero devices
|Chosen Device ID|Setting|
|-|-|
@@ -611,7 +621,7 @@ found 2 SYCL devices:
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
#### Execute
##### Execute
Choose one of following methods to run.
@@ -669,7 +679,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
## Environment Variable
#### Build
### Build
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
@@ -684,7 +694,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
1. FP32 or FP16 have different performance impact to LLM. Recommended to test them for better prompt processing performance on your models. You need to rebuild the code after change `GGML_SYCL_F16=OFF/ON`.
#### Runtime
### Runtime
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
@@ -777,7 +787,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
```
### **GitHub contribution**:
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
Please add the `[SYCL]` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
## TODO
+56 -29
View File
@@ -2,28 +2,29 @@
#include "ggml-cuda/common.cuh"
template <int S_v, bool KDA>
__global__ void gated_delta_net_cuda(const float * q,
const float * k,
const float * v,
const float * g,
const float * beta,
const float * curr_state,
float * dst,
int64_t H,
int64_t n_tokens,
int64_t n_seqs,
int64_t sq1,
int64_t sq2,
int64_t sq3,
int64_t sv1,
int64_t sv2,
int64_t sv3,
int64_t sb1,
int64_t sb2,
int64_t sb3,
int64_t rq1,
int64_t rq3,
float scale) {
__global__ void __launch_bounds__(S_v, 1)
gated_delta_net_cuda(const float * q,
const float * k,
const float * v,
const float * g,
const float * beta,
const float * curr_state,
float * dst,
const int64_t H,
const int64_t n_tokens,
const int64_t n_seqs,
const int64_t sq1,
const int64_t sq2,
const int64_t sq3,
const int64_t sv1,
const int64_t sv2,
const int64_t sv3,
const int64_t sb1,
const int64_t sb2,
const int64_t sb3,
const int64_t rq1,
const int64_t rq3,
const float scale) {
const int64_t h_idx = blockIdx.x;
const int64_t sequence = blockIdx.y;
const int col = threadIdx.x; // each thread owns one column
@@ -40,8 +41,14 @@ __global__ void gated_delta_net_cuda(const float * q,
curr_state += state_offset;
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
// Load state column into registers
// GCN and CDNA devices spill registers, we use shared mem for them. See https://github.com/ggml-org/llama.cpp/pull/20282#issuecomment-4025770229
// TODO: check optimal path for RDNA1 and RDNA2 devices.
#if (defined(GGML_USE_HIP) && !defined(RDNA3) && !defined(RDNA4)) || defined(GGML_USE_MUSA)
extern __shared__ float s_shared[];
float * s = s_shared + col * S_v;
#else
float s[S_v];
#endif
#pragma unroll
for (int i = 0; i < S_v; i++) {
s[i] = curr_state[i * S_v + col];
@@ -114,6 +121,15 @@ __global__ void gated_delta_net_cuda(const float * q,
}
}
static size_t calculate_smem(const int sv, int cc)
{
size_t smem = 0;
if ((GGML_CUDA_CC_IS_AMD(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_RDNA4(cc)) || GGML_CUDA_CC_IS_MTHREADS(cc)) {
smem = sv * sv * sizeof(float);
}
return smem;
}
template <bool KDA>
static void launch_gated_delta_net(
const float * q_d, const float * k_d, const float * v_d,
@@ -129,25 +145,36 @@ static void launch_gated_delta_net(
dim3 grid_dims(H, n_seqs, 1);
dim3 block_dims(S_v, 1, 1);
int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
switch (S_v) {
case 32:
gated_delta_net_cuda<32, KDA><<<grid_dims, block_dims, 0, stream>>>(
case 32: {
constexpr int sv = 32;
size_t smem = calculate_smem(sv, cc);
gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
sb1, sb2, sb3, rq1, rq3, scale);
break;
case 64:
gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
}
case 64: {
constexpr int sv = 64;
size_t smem = calculate_smem(sv, cc);
gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
sb1, sb2, sb3, rq1, rq3, scale);
break;
case 128:
gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
}
case 128: {
constexpr int sv = 128;
size_t smem = calculate_smem(sv, cc);
gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
sb1, sb2, sb3, rq1, rq3, scale);
break;
}
default:
GGML_ABORT("fatal error");
break;
+4 -1
View File
@@ -76,7 +76,7 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
int row = tid / load_cols;
int col = tid % load_cols;
#pragma unroll
for (int idx = tid; idx < total_elems; idx += split_d_inner) {
for (int idx = 0; idx < total_elems; idx += split_d_inner) {
if (row < (int)split_d_inner) {
smem[row * n_cols + col] = x_block[row * stride_x + col];
}
@@ -84,6 +84,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
col += split_d_inner;
row += col / load_cols;
col = col % load_cols;
if (idx >= total_elems - tid - split_d_inner) {
break;
}
}
__syncthreads();
+22 -7
View File
@@ -47,7 +47,7 @@ struct ggml_metal {
uint64_t fuse_cnt[GGML_OP_COUNT];
// capture state
bool capture_next_compute;
int capture_compute;
bool capture_started;
id<MTLCaptureScope> capture_scope;
@@ -158,10 +158,17 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
res->capture_next_compute = false;
res->capture_compute = 0;
res->capture_started = false;
res->capture_scope = nil;
{
const char * val = getenv("GGML_METAL_CAPTURE_COMPUTE");
if (val) {
res->capture_compute = atoi(val);
}
}
res->has_error = false;
res->gf = nil;
@@ -458,9 +465,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
const bool use_capture = ctx->capture_next_compute;
if (ctx->capture_compute >= 0) {
ctx->capture_compute--;
}
const bool use_capture = ctx->capture_compute == 0;
if (use_capture) {
ctx->capture_next_compute = false;
ctx->capture_compute = -1;
// make sure all previous computations have finished before starting the capture
if (ctx->cmd_buf_last) {
@@ -469,6 +480,10 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
}
if (!ctx->capture_started) {
NSString * path = [NSString stringWithFormat:@"/tmp/perf-metal-%d.gputrace", getpid()];
GGML_LOG_WARN("%s: capturing graph in %s\n", __func__, [path UTF8String]);
// create capture scope
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
@@ -476,7 +491,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
descriptor.captureObject = ctx->capture_scope;
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
descriptor.outputURL = [NSURL fileURLWithPath:path];
NSError * error = nil;
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
@@ -683,7 +698,7 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
idx_end,
ctx->use_fusion,
ctx->use_concurrency,
ctx->capture_next_compute,
ctx->capture_compute,
ctx->debug_graph,
ctx->debug_fusion);
@@ -718,5 +733,5 @@ bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
}
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
ctx->capture_next_compute = true;
ctx->capture_compute = 1;
}
+1 -1
View File
@@ -35,7 +35,7 @@
#define N_R0_Q4_K 2
#define N_SG_Q4_K 2
#define N_R0_Q5_K 2
#define N_R0_Q5_K 1
#define N_SG_Q5_K 2
#define N_R0_Q6_K 2
+1
View File
@@ -9081,6 +9081,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(
+8
View File
@@ -125,6 +125,7 @@ class Keys:
EXPERT_GROUP_SCALE = "{arch}.expert_group_scale"
EXPERTS_PER_GROUP = "{arch}.experts_per_group"
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
MOE_LATENT_SIZE = "{arch}.moe_latent_size"
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers"
POOLING_TYPE = "{arch}.pooling_type"
@@ -543,6 +544,8 @@ class MODEL_TENSOR(IntEnum):
FFN_DOWN_CHEXP = auto()
FFN_UP_CHEXP = auto()
FFN_EXP_PROBS_B = auto()
MOE_LATENT_DOWN = auto() # nemotron 3 super
MOE_LATENT_UP = auto() # nemotron 3 super
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
@@ -986,6 +989,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
@@ -2913,6 +2918,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
# expert latent
MODEL_TENSOR.MOE_LATENT_DOWN,
MODEL_TENSOR.MOE_LATENT_UP,
# shared expert
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
+3
View File
@@ -859,6 +859,9 @@ class GGUFWriter:
def add_moe_every_n_layers(self, value: int) -> None:
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
def add_moe_latent_size(self, value: int) -> None:
self.add_uint32(Keys.LLM.MOE_LATENT_SIZE.format(arch=self.arch), value)
def add_nextn_predict_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
+8
View File
@@ -571,6 +571,14 @@ class TensorNameMap:
"model.layers.{bid}.mlp.experts.gate_up_proj",
),
MODEL_TENSOR.MOE_LATENT_DOWN: (
"backbone.layers.{bid}.mixer.fc1_latent_proj", # nemotron 3 super
),
MODEL_TENSOR.MOE_LATENT_UP: (
"backbone.layers.{bid}.mixer.fc2_latent_proj", # nemotron 3 super
),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
+4
View File
@@ -293,6 +293,10 @@ class LlamaBenchData:
for t in self.repo.tags:
if t.name == name:
return t.commit.hexsha[:self.build_len]
for remote in self.repo.remotes:
for ref in remote.refs:
if ref.name == name or ref.remote_head == name:
return ref.commit.hexsha[:self.build_len]
for c in self.repo.iter_commits("--all"):
if c.hexsha[:self.build_len] == name[:self.build_len]:
return c.hexsha[:self.build_len]
+2 -2
View File
@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "refs/tags/v0.35.0"
HTTPLIB_VERSION = "refs/tags/v0.37.0"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
@@ -15,7 +15,7 @@ vendor = {
# not using latest tag to avoid this issue: https://github.com/ggml-org/llama.cpp/pull/17179#discussion_r2515877926
# "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.24/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://github.com/mackron/miniaudio/raw/13d161bc8d856ad61ae46b798bbeffc0f49808e8/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://github.com/mackron/miniaudio/raw/9634bedb5b5a2ca38c1ee7108a9358a4e233f14d/miniaudio.h": "vendor/miniaudio/miniaudio.h",
f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/httplib.h": "httplib.h",
f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/split.py": "split.py",
+8
View File
@@ -185,6 +185,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
@@ -365,6 +366,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
{ LLM_TENSOR_FFN_LATENT_DOWN, "blk.%d.ffn_latent_down" },
{ LLM_TENSOR_FFN_LATENT_UP, "blk.%d.ffn_latent_up" },
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
@@ -1879,6 +1882,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_FFN_LATENT_DOWN,
LLM_TENSOR_FFN_LATENT_UP,
// MoE shared expert layer
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
@@ -2754,6 +2759,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
// Nemotron 3 Super
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
+3
View File
@@ -189,6 +189,7 @@ enum llm_kv {
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_MOE_LATENT_SIZE,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_NUM_DEEPSTACK_LAYERS,
LLM_KV_POOLING_TYPE,
@@ -385,6 +386,8 @@ enum llm_tensor {
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_FFN_LATENT_DOWN,
LLM_TENSOR_FFN_LATENT_UP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
+1
View File
@@ -89,6 +89,7 @@ struct llama_hparams {
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
float f_norm_eps;
+9 -2
View File
@@ -135,6 +135,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_120B_A12B: return "120B.A12B";
case LLM_TYPE_122B_A10B: return "122B.A10B";
case LLM_TYPE_196B_A11B: return "196B.A11B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -1861,10 +1862,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);
switch (hparams.n_layer) {
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
case 56: type = LLM_TYPE_9B; break;
case 88: type = LLM_TYPE_120B_A12B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -5544,6 +5547,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_ssm_head = hparams.ssm_dt_rank;
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5603,8 +5607,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
// MoE branch
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_latent_up = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP, "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, moe_n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
// Shared expert branch
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+5
View File
@@ -126,6 +126,7 @@ enum llm_type {
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_120B_A12B, // Nemotron 3 Super
LLM_TYPE_122B_A10B, // Qwen3.5
LLM_TYPE_196B_A11B, // Step3.5-Flash
LLM_TYPE_230B_A10B, // Minimax M2
@@ -294,6 +295,10 @@ struct llama_layer {
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
// ff MoE latent proj
struct ggml_tensor * ffn_latent_down = nullptr;
struct ggml_tensor * ffn_latent_up = nullptr;
// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
struct ggml_tensor * ffn_gate_shexp = nullptr;
+18 -4
View File
@@ -114,9 +114,18 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
ggml_tensor * ffn_inp = cur;
ggml_tensor * inp_emb = cur;
ggml_tensor * inp_latent = cur;
if (model.layers[il].ffn_latent_down) {
inp_latent = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_down, cur);
}
ggml_tensor * router_logits = build_lora_mm(model.layers[il].ffn_gate_inp, cur);
cb(router_logits, "ffn_moe_logits", il);
ggml_tensor * moe_out =
build_moe_ffn(ffn_inp,
build_moe_ffn(inp_latent,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr, // no gate
@@ -126,10 +135,15 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
il,
router_logits);
cb(moe_out, "ffn_moe_out", il);
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
if (model.layers[il].ffn_latent_up) {
moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
}
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
+1
View File
@@ -149,6 +149,7 @@ endif ()
if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API (when building with shared libraries)
llama_build_and_test(test-sampling.cpp)
llama_build_and_test(test-reasoning-budget.cpp)
llama_build_and_test(test-grammar-parser.cpp)
llama_build_and_test(test-grammar-integration.cpp)
llama_build_and_test(test-llama-grammar.cpp)
+238
View File
@@ -0,0 +1,238 @@
#include "reasoning-budget.h"
#include "unicode.h"
#include "llama.h"
#include "ggml.h"
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>
// Reasoning budget sampler test helper
// These tests use nullptr vocab which safely falls back to treating all tokens as complete
// (The UTF-8 boundary detection logic is tested separately in test_utf8_boundary_detection)
static void test_reasoning_budget(
const char * test_name,
const std::vector<llama_token> & sequence,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state,
size_t expected_force_start, // token index where forcing should start (SIZE_MAX = never)
size_t expected_force_end // token index where forcing should end (after this, no more forcing)
) {
// Find the maximum token ID to ensure our vocab covers all tokens
llama_token max_token = 0;
for (auto t : sequence) max_token = std::max(max_token, t);
for (auto t : start_tokens) max_token = std::max(max_token, t);
for (auto t : end_tokens) max_token = std::max(max_token, t);
for (auto t : forced_tokens) max_token = std::max(max_token, t);
// Create a minimal sampler with mock vocabulary
// For this test, we use nullptr as vocab since we're testing state transitions
// The UTF-8 boundary check will treat all tokens as complete (safe fallback)
auto * sampler = common_reasoning_budget_init(
nullptr, // vocab - not used for basic state machine tests
start_tokens,
end_tokens,
forced_tokens,
budget,
initial_state
);
// Create a test token data array for checking forcing behavior
// Vocab size must be large enough to include all tokens (start, end, forced, sequence)
std::vector<llama_token_data> cur;
const size_t n_vocab = (size_t)max_token + 1;
for (size_t i = 0; i < n_vocab; i++) {
cur.emplace_back(llama_token_data{(llama_token)i, logf((float)(i+1)), 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
size_t actual_force_start = SIZE_MAX;
size_t actual_force_end = SIZE_MAX;
// Feed the sequence and track when forcing occurs
for (size_t i = 0; i < sequence.size(); i++) {
llama_sampler_accept(sampler, sequence[i]);
// Check if we're in forcing state by applying and seeing if logits are modified
cur_p.selected = -1;
for (size_t j = 0; j < cur.size(); j++) {
cur[j].logit = logf((float)(j+1)); // reset logits
}
llama_sampler_apply(sampler, &cur_p);
// Check if forcing is active (all logits except one should be -INFINITY)
size_t finite_count = 0;
llama_token finite_token = -1;
for (size_t j = 0; j < cur.size(); j++) {
if (std::isfinite(cur[j].logit)) {
finite_count++;
finite_token = cur[j].id;
}
}
fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
if (finite_count == 1) {
if (actual_force_start == SIZE_MAX) {
actual_force_start = i;
}
actual_force_end = i;
} else if (actual_force_start != SIZE_MAX && actual_force_end != SIZE_MAX) {
// Forcing stopped
break;
}
}
llama_sampler_free(sampler);
// Verify forcing occurred at expected positions
if (expected_force_start == SIZE_MAX) {
if (actual_force_start != SIZE_MAX) {
fprintf(stderr, "Test '%s' FAILED: Expected no forcing, but forcing occurred at %zu\n", test_name, actual_force_start);
GGML_ASSERT(false && "Expected no forcing, but forcing occurred");
}
} else {
if (actual_force_start == SIZE_MAX) {
fprintf(stderr, "Test '%s' FAILED: Expected forcing but none occurred\n", test_name);
GGML_ASSERT(false && "Expected forcing but none occurred");
}
if (actual_force_start != expected_force_start) {
fprintf(stderr, "Test '%s' FAILED: Forcing started at %zu, expected %zu\n", test_name, actual_force_start, expected_force_start);
GGML_ASSERT(false && "Forcing started at wrong position");
}
}
if (expected_force_end != SIZE_MAX) {
if (actual_force_end < expected_force_end) {
fprintf(stderr, "Test '%s' FAILED: Forcing ended at %zu, expected >= %zu\n", test_name, actual_force_end, expected_force_end);
GGML_ASSERT(false && "Forcing ended too early");
}
}
fprintf(stderr, " Test '%s' passed (force_start=%zu, force_end=%zu)\n", test_name, actual_force_start, actual_force_end);
(void)sequence;
}
// UTF-8 boundary detection unit test
// Tests common_utf8_is_complete() from reasoning-budget.h
static void test_utf8_boundary_detection() {
// Complete sequences
GGML_ASSERT(common_utf8_is_complete("hello"));
GGML_ASSERT(common_utf8_is_complete(""));
GGML_ASSERT(common_utf8_is_complete("\xC2\xA0")); // complete 2-byte UTF-8 (U+00A0)
GGML_ASSERT(common_utf8_is_complete("\xE2\x80\x9C")); // complete 3-byte UTF-8 (left double quote)
GGML_ASSERT(common_utf8_is_complete("\xF0\x9F\x98\x80")); // complete 4-byte UTF-8 (emoji)
GGML_ASSERT(common_utf8_is_complete("abc\xC3\xA9")); // ASCII + complete 2-byte
// Incomplete sequences
GGML_ASSERT(!common_utf8_is_complete(std::string("\xC2", 1))); // 2-byte start, missing continuation
GGML_ASSERT(!common_utf8_is_complete(std::string("\xE2\x80", 2))); // 3-byte start + 1 cont, missing 1
GGML_ASSERT(!common_utf8_is_complete(std::string("\xE2", 1))); // 3-byte start, missing 2
GGML_ASSERT(!common_utf8_is_complete(std::string("\xF0\x9F\x98", 3))); // 4-byte start + 2 cont, missing 1
GGML_ASSERT(!common_utf8_is_complete(std::string("\xF0\x9F", 2))); // 4-byte start + 1 cont, missing 2
GGML_ASSERT(!common_utf8_is_complete(std::string("\xF0", 1))); // 4-byte start, missing 3
GGML_ASSERT(!common_utf8_is_complete(std::string("\x80", 1))); // orphan continuation byte
// Mixed: ASCII followed by start of multi-byte
GGML_ASSERT(!common_utf8_is_complete(std::string("hello\xC3", 6))); // ASCII + incomplete 2-byte
GGML_ASSERT(common_utf8_is_complete(std::string("hello\xC3\xA9", 7))); // ASCII + complete 2-byte
}
int main(void) {
// Reasoning budget sampler tests
printf("Testing reasoning budget sampler... ");
// Test 1: Basic budget with start/end tokens - no forcing (natural end before budget exhausted)
{
const std::vector<llama_token> start = {100}; // start token
const std::vector<llama_token> end = {101}; // end token
const std::vector<llama_token> forced = {102}; // forced token (not used in this test)
const std::vector<llama_token> sequence = {100, 50, 51, 101, 52}; // start, two tokens, end, one more
test_reasoning_budget("natural end before budget exhausted", sequence, start, end, forced,
5, // budget of 5 tokens
REASONING_BUDGET_IDLE,
SIZE_MAX, SIZE_MAX); // no forcing expected (natural end)
}
// Test 2: Budget exhausted, forcing should occur
// Flow: i=0 accept(100)->COUNTING, i=1 accept(50)->remaining=1, i=2 accept(51)->remaining=0->FORCING
// Forcing is active at i=2 and i=3 (when apply() is called while in FORCING state)
// At i=4, force_pos becomes 2 which equals forced_tokens.size(), so state becomes DONE
{
const std::vector<llama_token> start = {100};
const std::vector<llama_token> end = {101};
const std::vector<llama_token> forced = {102, 101}; // forced message + end
const std::vector<llama_token> sequence = {100, 50, 51, 52, 53}; // start + 4 tokens (budget=2)
test_reasoning_budget("budget exhausted forcing", sequence, start, end, forced,
2, // budget of 2 tokens
REASONING_BUDGET_IDLE,
2, // forcing starts at i=2 (after accept(51) depletes budget, apply() forces)
3); // forcing continues through i=3 (at i=4 state becomes DONE)
}
// Test 3: Activate immediately with budget=0, forcing should start right away
// Flow: Since no start token in sequence, state stays IDLE (no start/end configured means passthrough)
// This test needs start token to be in the sequence or use activate_immediately with start token present
{
const std::vector<llama_token> start = {100};
const std::vector<llama_token> end = {101};
const std::vector<llama_token> forced = {102, 101};
const std::vector<llama_token> sequence = {100, 50, 51, 52}; // start token first, then 3 tokens
test_reasoning_budget("activate immediately budget=0", sequence, start, end, forced,
0, // budget of 0 tokens
REASONING_BUDGET_COUNTING, // starts counting, promoted to FORCING since budget=0
0, // forcing starts at i=0 (after accept(100), budget=0 goes straight to FORCING)
1); // forcing continues through i=1 (at i=2 state becomes DONE)
}
// Test 4: No start/end tokens configured - passthrough (no forcing)
{
const std::vector<llama_token> start = {};
const std::vector<llama_token> end = {};
const std::vector<llama_token> forced = {102};
const std::vector<llama_token> sequence = {50, 51, 52, 53};
test_reasoning_budget("no start/end configured", sequence, start, end, forced,
2, // budget
REASONING_BUDGET_IDLE,
SIZE_MAX, SIZE_MAX); // no forcing (no start/end configured)
}
// Test 5: Activate immediately with budget > 0, count down then force
// Flow: i=0 accept(50)->remaining=1, i=1 accept(51)->remaining=0->FORCING
// So forcing starts at i=1 (apply after accept sees FORCING with force_pos=0)
{
const std::vector<llama_token> start = {100};
const std::vector<llama_token> end = {101};
const std::vector<llama_token> forced = {102, 101};
const std::vector<llama_token> sequence = {50, 51, 52, 53};
test_reasoning_budget("activate immediately with budget", sequence, start, end, forced,
2, // budget of 2 tokens
REASONING_BUDGET_COUNTING,
1, // forcing starts at i=1 (after 2 accepts deplete budget)
2); // forcing continues through i=2
}
printf("OK (5 tests passed)\n");
printf("Testing UTF-8 boundary detection... ");
test_utf8_boundary_detection();
printf("OK\n");
return 0;
}
+22
View File
@@ -57,6 +57,8 @@ struct cli_context {
std::vector<raw_buffer> input_files;
task_params defaults;
bool verbose_prompt;
int reasoning_budget = -1;
std::string reasoning_budget_message;
// thread for showing "loading" animation
std::atomic<bool> loading_show;
@@ -73,6 +75,8 @@ struct cli_context {
// defaults.return_progress = true; // TODO: show progress
verbose_prompt = params.verbose_prompt;
reasoning_budget = params.reasoning_budget;
reasoning_budget_message = params.reasoning_budget_message;
}
std::string generate_completion(result_timings & out_timings) {
@@ -95,6 +99,24 @@ struct cli_context {
task.params.chat_parser_params.parser.load(chat_params.parser);
}
// reasoning budget sampler
if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) {
const llama_vocab * vocab = llama_model_get_vocab(
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
if (!chat_params.thinking_start_tag.empty()) {
task.params.sampling.reasoning_budget_start =
common_tokenize(vocab, chat_params.thinking_start_tag, false, true);
}
task.params.sampling.reasoning_budget_end =
common_tokenize(vocab, chat_params.thinking_end_tag, false, true);
task.params.sampling.reasoning_budget_forced =
common_tokenize(vocab, reasoning_budget_message + chat_params.thinking_end_tag, false, true);
}
rd.post_task({std::move(task)});
}
Binary file not shown.
+16
View File
@@ -1101,6 +1101,22 @@ json oaicompat_chat_params_parse(
llama_params["chat_parser"] = chat_params.parser;
}
// Reasoning budget: pass parameters through to sampling layer
{
int reasoning_budget = opt.reasoning_budget;
if (reasoning_budget == -1 && body.contains("thinking_budget_tokens")) {
reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
}
if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) {
llama_params["reasoning_budget_tokens"] = reasoning_budget;
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
}
}
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
if (json_value(body, "logprobs", false)) {
+2
View File
@@ -287,6 +287,8 @@ struct server_chat_params {
bool allow_image;
bool allow_audio;
bool enable_thinking = true;
int reasoning_budget = -1;
std::string reasoning_budget_message;
std::string media_path;
};
+5 -2
View File
@@ -893,9 +893,10 @@ private:
}
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 1. It's not explicitly disabled via --reasoning off
// 2. The chat template supports it
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
const bool template_supports_thinking = params_base.use_jinja && common_chat_templates_support_enable_thinking(chat_templates.get());
const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
chat_params = {
@@ -907,6 +908,8 @@ private:
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* reasoning_budget */ params_base.reasoning_budget,
/* reasoning_budget_msg */ params_base.reasoning_budget_message,
/* media_path */ params_base.media_path,
};
}
+28
View File
@@ -462,6 +462,34 @@ task_params server_task::params_from_json_cmpl(
}
}
// Parse reasoning budget sampler parameters
{
const int32_t budget = json_value(data, "reasoning_budget_tokens", (int32_t) -1);
if (budget >= 0) {
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
params.sampling.reasoning_budget_tokens = budget;
params.sampling.reasoning_budget_activate_immediately = activate_imm;
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
}
if (!end_tag.empty()) {
params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true);
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
}
SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
budget, activate_imm ? "true" : "false",
params.sampling.reasoning_budget_start.size(),
params.sampling.reasoning_budget_end.size(),
params.sampling.reasoning_budget_forced.size());
}
}
{
params.sampling.logit_bias.clear();
@@ -318,6 +318,12 @@ class AgenticStore {
const maxTurns = agenticConfig.maxTurns;
const maxToolPreviewLines = agenticConfig.maxToolPreviewLines;
// Resolve effective model for vision capability checks.
// In ROUTER mode, options.model is always set by the caller.
// In MODEL mode, options.model is undefined; use the single loaded model
// which carries modalities bridged from /props.
const effectiveModel = options.model || modelsStore.models[0]?.model || '';
for (let turn = 0; turn < maxTurns; turn++) {
this.updateSession(conversationId, { currentTurn: turn + 1 });
agenticTimings.turns = turn + 1;
@@ -571,14 +577,14 @@ class AgenticStore {
];
for (const attachment of attachments) {
if (attachment.type === AttachmentType.IMAGE) {
if (modelsStore.modelSupportsVision(options.model ?? '')) {
if (modelsStore.modelSupportsVision(effectiveModel)) {
contentParts.push({
type: ContentPartType.IMAGE_URL,
image_url: { url: (attachment as DatabaseMessageExtraImageFile).base64Url }
});
} else {
console.info(
`[AgenticStore] Skipping image attachment (model "${options.model}" does not support vision)`
`[AgenticStore] Skipping image attachment (model "${effectiveModel}" does not support vision)`
);
}
}
+84 -48
View File
@@ -813,17 +813,13 @@ bool is_websocket_upgrade(const Request &req) {
// Check Upgrade: websocket (case-insensitive)
auto upgrade_it = req.headers.find("Upgrade");
if (upgrade_it == req.headers.end()) { return false; }
auto upgrade_val = upgrade_it->second;
std::transform(upgrade_val.begin(), upgrade_val.end(), upgrade_val.begin(),
::tolower);
auto upgrade_val = case_ignore::to_lower(upgrade_it->second);
if (upgrade_val != "websocket") { return false; }
// Check Connection header contains "Upgrade"
auto connection_it = req.headers.find("Connection");
if (connection_it == req.headers.end()) { return false; }
auto connection_val = connection_it->second;
std::transform(connection_val.begin(), connection_val.end(),
connection_val.begin(), ::tolower);
auto connection_val = case_ignore::to_lower(connection_it->second);
if (connection_val.find("upgrade") == std::string::npos) { return false; }
// Check Sec-WebSocket-Key is a valid base64-encoded 16-byte value (24 chars)
@@ -2615,10 +2611,15 @@ bool can_compress_content_type(const std::string &content_type) {
switch (tag) {
case "image/svg+xml"_t:
case "application/javascript"_t:
case "application/x-javascript"_t:
case "application/json"_t:
case "application/ld+json"_t:
case "application/xml"_t:
case "application/protobuf"_t:
case "application/xhtml+xml"_t: return true;
case "application/xhtml+xml"_t:
case "application/rss+xml"_t:
case "application/atom+xml"_t:
case "application/xslt+xml"_t:
case "application/protobuf"_t: return true;
case "text/event-stream"_t: return false;
@@ -3038,17 +3039,13 @@ bool read_websocket_upgrade_response(Stream &strm,
// Verify Upgrade: websocket (case-insensitive)
auto upgrade_it = headers.find("Upgrade");
if (upgrade_it == headers.end()) { return false; }
auto upgrade_val = upgrade_it->second;
std::transform(upgrade_val.begin(), upgrade_val.end(), upgrade_val.begin(),
::tolower);
auto upgrade_val = case_ignore::to_lower(upgrade_it->second);
if (upgrade_val != "websocket") { return false; }
// Verify Connection header contains "Upgrade" (case-insensitive)
auto connection_it = headers.find("Connection");
if (connection_it == headers.end()) { return false; }
auto connection_val = connection_it->second;
std::transform(connection_val.begin(), connection_val.end(),
connection_val.begin(), ::tolower);
auto connection_val = case_ignore::to_lower(connection_it->second);
if (connection_val.find("upgrade") == std::string::npos) { return false; }
// Verify Sec-WebSocket-Accept header value
@@ -3934,14 +3931,10 @@ public:
file_.content_type =
trim_copy(header.substr(str_len(header_content_type)));
} else {
thread_local const std::regex re_content_disposition(
R"~(^Content-Disposition:\s*form-data;\s*(.*)$)~",
std::regex_constants::icase);
std::smatch m;
if (std::regex_match(header, m, re_content_disposition)) {
std::string disposition_params;
if (parse_content_disposition(header, disposition_params)) {
Params params;
parse_disposition_params(m[1], params);
parse_disposition_params(disposition_params, params);
auto it = params.find("name");
if (it != params.end()) {
@@ -3956,13 +3949,14 @@ public:
it = params.find("filename*");
if (it != params.end()) {
// Only allow UTF-8 encoding...
thread_local const std::regex re_rfc5987_encoding(
R"~(^UTF-8''(.+?)$)~", std::regex_constants::icase);
std::smatch m2;
if (std::regex_match(it->second, m2, re_rfc5987_encoding)) {
file_.filename = decode_path_component(m2[1]); // override...
// RFC 5987: only UTF-8 encoding is allowed
const auto &val = it->second;
constexpr const char utf8_prefix[] = "UTF-8''";
constexpr size_t prefix_len = str_len(utf8_prefix);
if (val.size() > prefix_len &&
start_with_case_ignore(val, utf8_prefix)) {
file_.filename = decode_path_component(
val.substr(prefix_len)); // override...
} else {
is_valid_ = false;
return false;
@@ -4030,17 +4024,48 @@ private:
file_.headers.clear();
}
bool start_with_case_ignore(const std::string &a, const char *b) const {
bool start_with_case_ignore(const std::string &a, const char *b,
size_t offset = 0) const {
const auto b_len = strlen(b);
if (a.size() < b_len) { return false; }
if (a.size() < offset + b_len) { return false; }
for (size_t i = 0; i < b_len; i++) {
if (case_ignore::to_lower(a[i]) != case_ignore::to_lower(b[i])) {
if (case_ignore::to_lower(a[offset + i]) != case_ignore::to_lower(b[i])) {
return false;
}
}
return true;
}
// Parses "Content-Disposition: form-data; <params>" without std::regex.
// Returns true if header matches, with the params portion in `params_out`.
bool parse_content_disposition(const std::string &header,
std::string &params_out) const {
constexpr const char prefix[] = "Content-Disposition:";
constexpr size_t prefix_len = str_len(prefix);
if (!start_with_case_ignore(header, prefix)) { return false; }
// Skip whitespace after "Content-Disposition:"
auto pos = prefix_len;
while (pos < header.size() && (header[pos] == ' ' || header[pos] == '\t')) {
pos++;
}
// Match "form-data;" (case-insensitive)
constexpr const char form_data[] = "form-data;";
constexpr size_t form_data_len = str_len(form_data);
if (!start_with_case_ignore(header, form_data, pos)) { return false; }
pos += form_data_len;
// Skip whitespace after "form-data;"
while (pos < header.size() && (header[pos] == ' ' || header[pos] == '\t')) {
pos++;
}
params_out = header.substr(pos);
return true;
}
const std::string dash_ = "--";
const std::string crlf_ = "\r\n";
std::string boundary_;
@@ -4992,9 +5017,10 @@ bool match_hostname(const std::string &pattern,
// Verify certificate using Windows CertGetCertificateChain API.
// This provides real-time certificate validation with Windows Update
// integration, independent of the TLS backend (OpenSSL or MbedTLS).
bool verify_cert_with_windows_schannel(
const std::vector<unsigned char> &der_cert, const std::string &hostname,
bool verify_hostname, unsigned long &out_error) {
bool
verify_cert_with_windows_schannel(const std::vector<unsigned char> &der_cert,
const std::string &hostname,
bool verify_hostname, uint64_t &out_error) {
if (der_cert.empty()) { return false; }
out_error = 0;
@@ -7987,7 +8013,7 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
#else
try {
routed = routing(req, res, strm);
} catch (std::exception &e) {
} catch (std::exception &) {
if (exception_handler_) {
auto ep = std::current_exception();
exception_handler_(req, res, ep);
@@ -11811,7 +11837,7 @@ bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
server_certificate_verification_) {
verify_result_ = tls::get_verify_result(session);
if (verify_result_ != 0) {
last_backend_error_ = static_cast<unsigned long>(verify_result_);
last_backend_error_ = static_cast<uint64_t>(verify_result_);
error = Error::SSLServerVerification;
output_error_log(error, nullptr);
return false;
@@ -11850,7 +11876,7 @@ bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
ca_cert_dir_path_.empty() && ca_cert_pem_.empty()) {
std::vector<unsigned char> der;
if (get_cert_der(server_cert, der)) {
unsigned long wincrypt_error = 0;
uint64_t wincrypt_error = 0;
if (!detail::verify_cert_with_windows_schannel(
der, host_, server_hostname_verification_, wincrypt_error)) {
last_backend_error_ = wincrypt_error;
@@ -11974,16 +12000,26 @@ bool is_ipv4_address(const std::string &str) {
// Parse IPv4 address string to bytes
bool parse_ipv4(const std::string &str, unsigned char *out) {
int parts[4];
if (sscanf(str.c_str(), "%d.%d.%d.%d", &parts[0], &parts[1], &parts[2],
&parts[3]) != 4) {
return false;
}
const char *p = str.c_str();
for (int i = 0; i < 4; i++) {
if (parts[i] < 0 || parts[i] > 255) return false;
out[i] = static_cast<unsigned char>(parts[i]);
if (i > 0) {
if (*p != '.') { return false; }
p++;
}
int val = 0;
int digits = 0;
while (*p >= '0' && *p <= '9') {
val = val * 10 + (*p - '0');
if (val > 255) { return false; }
p++;
digits++;
}
if (digits == 0) { return false; }
// Reject leading zeros (e.g., "01.002.03.04") to prevent ambiguity
if (digits > 1 && *(p - digits) == '0') { return false; }
out[i] = static_cast<unsigned char>(val);
}
return true;
return *p == '\0';
}
#ifdef _WIN32
@@ -13285,11 +13321,11 @@ void update_server_certs_from_x509(ctx_t ctx, X509 *cert, EVP_PKEY *key,
ctx_t create_client_context_from_x509(X509 *cert, EVP_PKEY *key,
const char *password,
unsigned long &out_error) {
uint64_t &out_error) {
out_error = 0;
auto ctx = create_client_context();
if (!ctx) {
out_error = static_cast<unsigned long>(get_error());
out_error = get_error();
return nullptr;
}
@@ -13303,7 +13339,7 @@ ctx_t create_client_context_from_x509(X509 *cert, EVP_PKEY *key,
}
if (!set_client_cert_pem(ctx, cert_pem.c_str(), key_pem.c_str(),
password)) {
out_error = static_cast<unsigned long>(get_error());
out_error = get_error();
free_context(ctx);
return nullptr;
}
+15 -7
View File
@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.35.0"
#define CPPHTTPLIB_VERSION_NUM "0x002300"
#define CPPHTTPLIB_VERSION "0.37.0"
#define CPPHTTPLIB_VERSION_NUM "0x002500"
/*
* Platform compatibility check
@@ -575,6 +575,14 @@ inline unsigned char to_lower(int c) {
return table[(unsigned char)(char)c];
}
inline std::string to_lower(const std::string &s) {
std::string result = s;
std::transform(
result.begin(), result.end(), result.begin(),
[](unsigned char c) { return static_cast<char>(to_lower(c)); });
return result;
}
inline bool equal(const std::string &a, const std::string &b) {
return a.size() == b.size() &&
std::equal(a.begin(), a.end(), b.begin(), [](char ca, char cb) {
@@ -1859,23 +1867,23 @@ public:
: res_(std::move(res)), err_(err),
request_headers_(std::move(request_headers)), ssl_error_(ssl_error) {}
Result(std::unique_ptr<Response> &&res, Error err, Headers &&request_headers,
int ssl_error, unsigned long ssl_backend_error)
int ssl_error, uint64_t ssl_backend_error)
: res_(std::move(res)), err_(err),
request_headers_(std::move(request_headers)), ssl_error_(ssl_error),
ssl_backend_error_(ssl_backend_error) {}
int ssl_error() const { return ssl_error_; }
unsigned long ssl_backend_error() const { return ssl_backend_error_; }
uint64_t ssl_backend_error() const { return ssl_backend_error_; }
private:
int ssl_error_ = 0;
unsigned long ssl_backend_error_ = 0;
uint64_t ssl_backend_error_ = 0;
#endif
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
public:
[[deprecated("Use ssl_backend_error() instead")]]
unsigned long ssl_openssl_error() const {
uint64_t ssl_openssl_error() const {
return ssl_backend_error_;
}
#endif
@@ -2345,7 +2353,7 @@ protected:
bool server_hostname_verification_ = true;
std::string ca_cert_pem_; // Store CA cert PEM for redirect transfer
int last_ssl_error_ = 0;
unsigned long last_backend_error_ = 0;
uint64_t last_backend_error_ = 0;
#endif
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+47 -27
View File
@@ -1,6 +1,6 @@
/*
Audio playback and capture library. Choice of public domain or MIT-0. See license statements at the end of this file.
miniaudio - v0.11.24 - 2026-01-17
miniaudio - v0.11.25 - 2026-03-04
David Reid - mackron@gmail.com
@@ -3747,7 +3747,7 @@ extern "C" {
#define MA_VERSION_MAJOR 0
#define MA_VERSION_MINOR 11
#define MA_VERSION_REVISION 24
#define MA_VERSION_REVISION 25
#define MA_VERSION_STRING MA_XSTRINGIFY(MA_VERSION_MAJOR) "." MA_XSTRINGIFY(MA_VERSION_MINOR) "." MA_XSTRINGIFY(MA_VERSION_REVISION)
#if defined(_MSC_VER) && !defined(__clang__)
@@ -19358,7 +19358,7 @@ MA_API ma_handle ma_dlopen(ma_log* pLog, const char* filename)
#else
/* *sigh* It appears there is no ANSI version of LoadPackagedLibrary()... */
WCHAR filenameW[4096];
if (MultiByteToWideChar(CP_UTF8, 0, filename, -1, filenameW, sizeof(filenameW)) == 0) {
if (MultiByteToWideChar(CP_UTF8, 0, filename, -1, filenameW, ma_countof(filenameW)) == 0) {
handle = NULL;
} else {
handle = (ma_handle)LoadPackagedLibrary(filenameW, 0);
@@ -41495,18 +41495,37 @@ Web Audio Backend
#ifdef MA_HAS_WEBAUDIO
#include <emscripten/emscripten.h>
#if (__EMSCRIPTEN_major__ > 3) || (__EMSCRIPTEN_major__ == 3 && (__EMSCRIPTEN_minor__ > 1 || (__EMSCRIPTEN_minor__ == 1 && __EMSCRIPTEN_tiny__ >= 32)))
#ifndef MA_EMSCRIPTEN_MAJOR
#if defined(__EMSCRIPTEN_MAJOR__)
#define MA_EMSCRIPTEN_MAJOR __EMSCRIPTEN_MAJOR__
#else
#define MA_EMSCRIPTEN_MAJOR __EMSCRIPTEN_major__
#endif
#endif
#ifndef MA_EMSCRIPTEN_MINOR
#if defined(__EMSCRIPTEN_MINOR__)
#define MA_EMSCRIPTEN_MINOR __EMSCRIPTEN_MINOR__
#else
#define MA_EMSCRIPTEN_MINOR __EMSCRIPTEN_minor__
#endif
#endif
#ifndef MA_EMSCRIPTEN_TINY
#if defined(__EMSCRIPTEN_TINY__)
#define MA_EMSCRIPTEN_TINY __EMSCRIPTEN_TINY__
#else
#define MA_EMSCRIPTEN_TINY __EMSCRIPTEN_tiny__
#endif
#endif
#if (MA_EMSCRIPTEN_MAJOR > 3) || (MA_EMSCRIPTEN_MAJOR == 3 && (MA_EMSCRIPTEN_MINOR > 1 || (MA_EMSCRIPTEN_MINOR == 1 && MA_EMSCRIPTEN_TINY >= 32)))
#include <emscripten/webaudio.h>
#define MA_SUPPORT_AUDIO_WORKLETS
#if (__EMSCRIPTEN_major__ > 3) || (__EMSCRIPTEN_major__ == 3 && (__EMSCRIPTEN_minor__ > 1 || (__EMSCRIPTEN_minor__ == 1 && __EMSCRIPTEN_tiny__ >= 70)))
#if (MA_EMSCRIPTEN_MAJOR > 3) || (MA_EMSCRIPTEN_MAJOR == 3 && (MA_EMSCRIPTEN_MINOR > 1 || (MA_EMSCRIPTEN_MINOR == 1 && MA_EMSCRIPTEN_TINY >= 70)))
#define MA_SUPPORT_AUDIO_WORKLETS_VARIABLE_BUFFER_SIZE
#endif
#endif
/*
TODO: Version 0.12: Swap this logic around so that AudioWorklets are used by default. Add MA_NO_AUDIO_WORKLETS.
*/
#if defined(MA_ENABLE_AUDIO_WORKLETS) && defined(MA_SUPPORT_AUDIO_WORKLETS)
#define MA_USE_AUDIO_WORKLETS
#endif
@@ -59243,6 +59262,10 @@ static ma_result ma_data_source_read_pcm_frames_within_range(ma_data_source* pDa
ma_uint64 framesRead = 0;
ma_bool32 loop = ma_data_source_is_looping(pDataSource);
if (pFramesRead != NULL) {
*pFramesRead = 0;
}
if (pDataSourceBase == NULL) {
return MA_AT_END;
}
@@ -61921,7 +61944,7 @@ extern "C" {
#define MA_DR_WAV_XSTRINGIFY(x) MA_DR_WAV_STRINGIFY(x)
#define MA_DR_WAV_VERSION_MAJOR 0
#define MA_DR_WAV_VERSION_MINOR 14
#define MA_DR_WAV_VERSION_REVISION 4
#define MA_DR_WAV_VERSION_REVISION 5
#define MA_DR_WAV_VERSION_STRING MA_DR_WAV_XSTRINGIFY(MA_DR_WAV_VERSION_MAJOR) "." MA_DR_WAV_XSTRINGIFY(MA_DR_WAV_VERSION_MINOR) "." MA_DR_WAV_XSTRINGIFY(MA_DR_WAV_VERSION_REVISION)
#include <stddef.h>
#define MA_DR_WAVE_FORMAT_PCM 0x1
@@ -80503,6 +80526,13 @@ MA_PRIVATE ma_uint64 ma_dr_wav__read_smpl_to_metadata_obj(ma_dr_wav__metadata_pa
MA_DR_WAV_ASSERT(pChunkHeader != NULL);
if (pMetadata != NULL && bytesJustRead == sizeof(smplHeaderData)) {
ma_uint32 iSampleLoop;
ma_uint32 loopCount;
ma_uint32 calculatedLoopCount;
loopCount = ma_dr_wav_bytes_to_u32(smplHeaderData + 28);
calculatedLoopCount = (pChunkHeader->sizeInBytes - MA_DR_WAV_SMPL_BYTES) / MA_DR_WAV_SMPL_LOOP_BYTES;
if (loopCount != calculatedLoopCount) {
return totalBytesRead;
}
pMetadata->type = ma_dr_wav_metadata_type_smpl;
pMetadata->data.smpl.manufacturerId = ma_dr_wav_bytes_to_u32(smplHeaderData + 0);
pMetadata->data.smpl.productId = ma_dr_wav_bytes_to_u32(smplHeaderData + 4);
@@ -80513,7 +80543,7 @@ MA_PRIVATE ma_uint64 ma_dr_wav__read_smpl_to_metadata_obj(ma_dr_wav__metadata_pa
pMetadata->data.smpl.smpteOffset = ma_dr_wav_bytes_to_u32(smplHeaderData + 24);
pMetadata->data.smpl.sampleLoopCount = ma_dr_wav_bytes_to_u32(smplHeaderData + 28);
pMetadata->data.smpl.samplerSpecificDataSizeInBytes = ma_dr_wav_bytes_to_u32(smplHeaderData + 32);
if (pMetadata->data.smpl.sampleLoopCount == (pChunkHeader->sizeInBytes - MA_DR_WAV_SMPL_BYTES) / MA_DR_WAV_SMPL_LOOP_BYTES) {
if (pMetadata->data.smpl.sampleLoopCount == calculatedLoopCount) {
pMetadata->data.smpl.pLoops = (ma_dr_wav_smpl_loop*)ma_dr_wav__metadata_get_memory(pParser, sizeof(ma_dr_wav_smpl_loop) * pMetadata->data.smpl.sampleLoopCount, MA_DR_WAV_METADATA_ALIGNMENT);
for (iSampleLoop = 0; iSampleLoop < pMetadata->data.smpl.sampleLoopCount; ++iSampleLoop) {
ma_uint8 smplLoopData[MA_DR_WAV_SMPL_LOOP_BYTES];
@@ -80534,6 +80564,8 @@ MA_PRIVATE ma_uint64 ma_dr_wav__read_smpl_to_metadata_obj(ma_dr_wav__metadata_pa
MA_DR_WAV_ASSERT(pMetadata->data.smpl.pSamplerSpecificData != NULL);
ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.smpl.pSamplerSpecificData, pMetadata->data.smpl.samplerSpecificDataSizeInBytes, &totalBytesRead);
}
} else {
MA_DR_WAV_ZERO_OBJECT(&pMetadata->data.smpl);
}
}
return totalBytesRead;
@@ -83149,19 +83181,13 @@ MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__msadpcm(ma_dr_wav* pWav, ma_
newSample0 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
newSample0 += nibble0 * pWav->msadpcm.delta[0];
newSample0 = ma_dr_wav_clamp(newSample0, -32768, 32767);
pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8;
if (pWav->msadpcm.delta[0] < 16) {
pWav->msadpcm.delta[0] = 16;
}
pWav->msadpcm.delta[0] = (ma_int32)ma_dr_wav_clamp(((ma_int64)adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8, 16, 0x7FFFFFFF);
pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
pWav->msadpcm.prevFrames[0][1] = newSample0;
newSample1 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
newSample1 += nibble1 * pWav->msadpcm.delta[0];
newSample1 = ma_dr_wav_clamp(newSample1, -32768, 32767);
pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[0]) >> 8;
if (pWav->msadpcm.delta[0] < 16) {
pWav->msadpcm.delta[0] = 16;
}
pWav->msadpcm.delta[0] = (ma_int32)ma_dr_wav_clamp(((ma_int64)adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[0]) >> 8, 16, 0x7FFFFFFF);
pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
pWav->msadpcm.prevFrames[0][1] = newSample1;
pWav->msadpcm.cachedFrames[2] = newSample0;
@@ -83176,10 +83202,7 @@ MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__msadpcm(ma_dr_wav* pWav, ma_
newSample0 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
newSample0 += nibble0 * pWav->msadpcm.delta[0];
newSample0 = ma_dr_wav_clamp(newSample0, -32768, 32767);
pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8;
if (pWav->msadpcm.delta[0] < 16) {
pWav->msadpcm.delta[0] = 16;
}
pWav->msadpcm.delta[0] = (ma_int32)ma_dr_wav_clamp(((ma_int64)adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8, 16, 0x7FFFFFFF);
pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
pWav->msadpcm.prevFrames[0][1] = newSample0;
if (pWav->msadpcm.predictor[1] >= ma_dr_wav_countof(coeff1Table) || pWav->msadpcm.predictor[1] >= ma_dr_wav_countof(coeff2Table)) {
@@ -83188,10 +83211,7 @@ MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__msadpcm(ma_dr_wav* pWav, ma_
newSample1 = ((pWav->msadpcm.prevFrames[1][1] * coeff1Table[pWav->msadpcm.predictor[1]]) + (pWav->msadpcm.prevFrames[1][0] * coeff2Table[pWav->msadpcm.predictor[1]])) >> 8;
newSample1 += nibble1 * pWav->msadpcm.delta[1];
newSample1 = ma_dr_wav_clamp(newSample1, -32768, 32767);
pWav->msadpcm.delta[1] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[1]) >> 8;
if (pWav->msadpcm.delta[1] < 16) {
pWav->msadpcm.delta[1] = 16;
}
pWav->msadpcm.delta[1] = (ma_int32)ma_dr_wav_clamp(((ma_int64)adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[1]) >> 8, 16, 0x7FFFFFFF);
pWav->msadpcm.prevFrames[1][0] = pWav->msadpcm.prevFrames[1][1];
pWav->msadpcm.prevFrames[1][1] = newSample1;
pWav->msadpcm.cachedFrames[2] = newSample0;
@@ -95825,7 +95845,7 @@ For more information, please refer to <http://unlicense.org/>
===============================================================================
ALTERNATIVE 2 - MIT No Attribution
===============================================================================
Copyright 2025 David Reid
Copyright 2026 David Reid
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in