Compare commits

...

10 Commits

Author SHA1 Message Date
Xuan-Son Nguyen 2c1f199653 cli : fix reasoning responses in CLI (#18961)
* cli : fix reasoning responses in CLI

* fix build

* fix build (2)
2026-01-20 18:23:25 +01:00
Oliver Simons d1e3556481 CUDA: Replace init_offsets kernel with iterators in cub-based argsort (#18930)
* CUDA: Replace `init_offsets` with iterators in argsort

This is a QOL improvement, saving us the cost of materializing the
iterator

* Remove unnecessary include from top-k.cu
2026-01-20 20:11:01 +08:00
Adrien Gallouët 08f3f4a8a3 ggml : cleanup path_str() (#18928)
- Remove pragmas as `std::codecvt_utf8` is not used.
- Avoid implicit `strlen()`.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-01-20 11:42:49 +01:00
Georgi Gerganov 271191906c metal : enable FA for MLA heads (#18950) 2026-01-20 12:21:28 +02:00
Daniel Bevenius 7dee9ff59a convert : use n_groups instead of hardcoded values in reshape (#18929)
* convert : use n_groups instead of hardcoded values in reshape

This commit modifies the conversion script for NemotronHModel to use
the 'n_groups' hyperparameter, and allow Python to calculate the the
last dimension, using -1, when reshaping the 'mixer.norm.weight' tensor.

* use self.n_group instead of self.hparams["n_groups"]
2026-01-20 06:55:24 +01:00
Xuan-Son Nguyen 6df686bee6 server : refactor oai_parser_opt, move it to server_chat_params (#18937)
* server_chat_params

* move chat format into CLI

* use meta whenever possible

* clean up, no more chatml fallback
2026-01-19 23:28:01 +01:00
ddh0 1706a6d7c6 convert : support Glm4MoeLite (#18936)
* initial commit for branch

* add glm-4.7-flash, move tokenizer hash

* use `glm4` pretok

* silence flake8 E302 (CI)

* apply review feedback

* add <|user|> as eog

* also add EOG `<|observation|>`

* revert llama-vocab

* inherit vocab from glm4

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-01-19 23:09:20 +01:00
Sigbjørn Skjæret 959ecf7f23 jinja : fix undefined keys and attributes and int/float as bool (#18924)
* fix undefined keys and attributes

* add falsy tests

* as_bool for integers and floats

* more falsy/truthy tests

* --typo
2026-01-19 20:29:43 +01:00
Sigbjørn Skjæret 4037093c66 ci : run test-jinja -py on high perf [no ci] (#18916) 2026-01-19 20:29:15 +01:00
Lennart Austenfeld 18361c579c server: fix memory reservations in populate_token_probs (#18787) 2026-01-19 19:13:31 +01:00
29 changed files with 674 additions and 546 deletions
+1 -1
View File
@@ -254,7 +254,7 @@ function gg_run_ctest_release {
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
+3 -3
View File
@@ -129,7 +129,7 @@ static void parse_json_tool_calls(
}
}
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax)
: input_(input), is_partial_(is_partial), syntax_(syntax)
{
result_.role = "assistant";
@@ -1611,7 +1611,7 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
builder.finish();
}
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
@@ -1635,7 +1635,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
return msg;
}
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
if (parser.empty()) {
throw std::runtime_error("Failed to parse due to missing parser definition.");
}
+4 -4
View File
@@ -5,7 +5,7 @@
#include "json-partial.h"
#include "regex-partial.h"
#include <nlohmann/json.hpp>
#include <nlohmann/json_fwd.hpp>
#include <optional>
#include <string>
@@ -19,20 +19,20 @@ class common_chat_msg_partial_exception : public std::runtime_error {
class common_chat_msg_parser {
std::string input_;
bool is_partial_;
common_chat_syntax syntax_;
common_chat_parser_params syntax_; // TODO: rename to params
std::string healing_marker_;
size_t pos_ = 0;
common_chat_msg result_;
public:
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
const std::string & input() const { return input_; }
size_t pos() const { return pos_; }
const std::string & healing_marker() const { return healing_marker_; }
const bool & is_partial() const { return is_partial_; }
const common_chat_msg & result() const { return result_; }
const common_chat_syntax & syntax() const { return syntax_; }
const common_chat_parser_params & syntax() const { return syntax_; }
void move_to(size_t pos) {
if (pos > input_.size()) {
+7 -7
View File
@@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
if (variant != nullptr) {
if (strcmp(variant, "tool_use") == 0) {
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (!variant.empty()) {
if (variant == "tool_use") {
if (tmpls->template_tool_use) {
return tmpls->template_tool_use->source().c_str();
return tmpls->template_tool_use->source();
}
return nullptr;
return "";
} else {
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
}
}
return tmpls->template_default->source().c_str();
return tmpls->template_default->source();
}
common_chat_templates_ptr common_chat_templates_init(
+17 -8
View File
@@ -145,7 +145,7 @@ struct common_chat_templates_inputs {
std::vector<common_chat_tool> tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
@@ -165,14 +165,21 @@ struct common_chat_params {
std::string parser;
};
struct common_chat_syntax {
// per-message parsing syntax
// should be derived from common_chat_params
struct common_chat_parser_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
common_peg_arena parser = {};
common_chat_parser_params() = default;
common_chat_parser_params(const common_chat_params & chat_params) {
format = chat_params.format;
thinking_forced_open = chat_params.thinking_forced_open;
}
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@@ -191,7 +198,7 @@ common_chat_templates_ptr common_chat_templates_init(
const std::string & eos_token_override = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
struct common_chat_params common_chat_templates_apply(
@@ -213,10 +220,12 @@ std::string common_chat_format_example(
const std::map<std::string, std::string> & chat_template_kwargs);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
// used by arg and server
const char * common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
+1
View File
@@ -284,6 +284,7 @@ struct common_params_diffusion {
};
// reasoning API response format (not to be confused as chat template's reasoning format)
// only used by server
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
+2 -2
View File
@@ -805,7 +805,7 @@ value member_expression::execute_impl(context & ctx) {
} else if (is_val<value_string>(property)) {
auto key = property->as_string().str();
JJ_DEBUG("Accessing %s built-in '%s'", is_val<value_array>(object) ? "array" : "string", key.c_str());
val = try_builtin_func(ctx, key, object);
val = try_builtin_func(ctx, key, object, true);
} else {
throw std::runtime_error("Cannot access property with non-string/non-number: got " + property->type());
}
@@ -814,7 +814,7 @@ value member_expression::execute_impl(context & ctx) {
throw std::runtime_error("Cannot access property with non-string: got " + property->type());
}
auto key = property->as_string().str();
val = try_builtin_func(ctx, key, object);
val = try_builtin_func(ctx, key, object, true);
}
if (ctx.is_get_stats && val && object && property) {
+6
View File
@@ -203,6 +203,9 @@ struct value_int_t : public value_t {
virtual int64_t as_int() const override { return val_int; }
virtual double as_float() const override { return static_cast<double>(val_int); }
virtual string as_string() const override { return std::to_string(val_int); }
virtual bool as_bool() const override {
return val_int != 0;
}
virtual const func_builtins & get_builtins() const override;
};
using value_int = std::shared_ptr<value_int_t>;
@@ -219,6 +222,9 @@ struct value_float_t : public value_t {
if (out.back() == '.') out.push_back('0'); // leave one zero if no decimals
return out;
}
virtual bool as_bool() const override {
return val_flt != 0.0;
}
virtual const func_builtins & get_builtins() const override;
};
using value_float = std::shared_ptr<value_float_t>;
+1
View File
@@ -1,5 +1,6 @@
#pragma once
// TODO: use json_fwd.hpp when possible
#include <nlohmann/json.hpp>
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
+31 -2
View File
@@ -1078,6 +1078,9 @@ class TextModel(ModelBase):
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
# ref: https://huggingface.co/aari1995/German_Semantic_V3
res = "jina-v2-de"
if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
# ref: https://huggingface.co/zai-org/GLM-4.7-Flash
res = "glm4"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -7458,7 +7461,7 @@ class DeepseekModel(TextModel):
"DeepseekV3ForCausalLM",
"KimiVLForConditionalGeneration",
"YoutuForCausalLM",
"YoutuVLForConditionalGeneration"
"YoutuVLForConditionalGeneration",
)
class DeepseekV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -8446,6 +8449,32 @@ class Glm4MoeModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("Glm4MoeLiteForCausalLM")
class Glm4MoeLiteModel(DeepseekV2Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
# copied from Glm4MoeModel
def set_vocab(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
# Special tokens
# Note: Using <|endoftext|> (151329) for eot causes endless generation
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
special_vocab.add_to_gguf(self.gguf_writer)
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(TextModel):
model_arch = gguf.MODEL_ARCH.CHATGLM
@@ -9183,7 +9212,7 @@ class NemotronHModel(GraniteHybridModel):
return [(mapped_name, reshaped_data)]
if name.endswith("mixer.norm.weight"):
reshaped_data = data_torch.reshape(8, 512)
reshaped_data = data_torch.reshape(self.n_group, -1)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
+1
View File
@@ -170,6 +170,7 @@ pre_computed_hashes = [
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
]
+4 -20
View File
@@ -77,39 +77,23 @@
#include "ggml-zendnn.h"
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
namespace fs = std::filesystem;
static std::string path_str(const fs::path & path) {
std::string u8path;
try {
#if defined(__cpp_lib_char8_t)
// C++20 and later: u8string() returns std::u8string
std::u8string u8str = path.u8string();
u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
const std::u8string u8str = path.u8string();
return std::string(reinterpret_cast<const char *>(u8str.data()), u8str.size());
#else
// C++17: u8string() returns std::string
u8path = path.u8string();
return path.u8string();
#endif
} catch (...) {
return std::string();
}
return u8path;
}
#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
+7 -15
View File
@@ -14,12 +14,6 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr
}
}
static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx <= nrows) {
offsets[idx] = idx * ncols;
}
}
#ifdef GGML_CUDA_USE_CUB
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
@@ -31,18 +25,15 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
cudaStream_t stream) {
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
int * temp_indices = temp_indices_alloc.get();
float * temp_keys = temp_keys_alloc.get();
int * d_offsets = offsets_alloc.get();
static const int block_size = 256;
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
const dim3 offset_grid((nrows + block_size - 1) / block_size);
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
@@ -57,7 +48,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
d_offsets, d_offsets + 1, stream);
offset_iterator, offset_iterator + 1, stream);
}
} else {
if (nrows == 1) {
@@ -66,7 +57,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
ncols, 0, sizeof(float) * 8, stream);
} else {
DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
stream);
}
}
@@ -80,7 +72,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
ncols, 0, sizeof(float) * 8, stream);
} else {
DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream);
}
} else {
if (nrows == 1) {
@@ -89,8 +81,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
ncols, 0, sizeof(float) * 8, stream);
} else {
DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
stream);
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
offset_iterator + 1, stream);
}
}
}
-1
View File
@@ -4,7 +4,6 @@
#ifdef GGML_CUDA_USE_CUB
# include <cub/cub.cuh>
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
# include <cuda/iterator>
# define CUB_TOP_K_AVAILABLE
using namespace cub;
# endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
+2 -6
View File
@@ -1078,12 +1078,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
op->src[0]->ne[0] != 112 &&
op->src[0]->ne[0] != 128 &&
op->src[0]->ne[0] != 192 &&
op->src[0]->ne[0] != 256) {
return false;
}
if (op->src[0]->ne[0] == 576) {
// DeepSeek sizes
// TODO: disabled for now, until optmized
op->src[0]->ne[0] != 256 &&
op->src[0]->ne[0] != 576) {
return false;
}
if (op->src[1]->type != op->src[2]->type) {
+1 -1
View File
@@ -2520,7 +2520,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
// simdgroups per threadgroup (a.k.a. warps)
//nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
int32_t nsg = 4;
int32_t nsg = ne00 >= 512 ? 8 : 4;
const size_t smem = FATTN_SMEM(nsg);
+8 -5
View File
@@ -5552,9 +5552,7 @@ void kernel_flash_attn_ext_impl(
constexpr short NC = (C/8)/NSG;
// note: do not unroll for large heads
#pragma unroll (DK <= 64 ? NC : 1)
for (short cc = 0; cc < NC; ++cc) {
FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
qk8x8_t mqk = make_filled_simdgroup_matrix<qk_t, 8>((qk_t) 0.0f);
if (DK % 16 != 0) {
@@ -5575,7 +5573,9 @@ void kernel_flash_attn_ext_impl(
k8x8_t mk[2];
q8x8_t mq[2];
FOR_UNROLL (short i = 0; i < DK8/2; ++i) {
// note: too much unroll can tank the performance for large heads
#pragma unroll (MIN(DK8/2, 4*NSG))
for (short i = 0; i < DK8/2; ++i) {
simdgroup_barrier(mem_flags::mem_none);
simdgroup_load(mq[0], pq + 0*8 + 16*i, DK);
@@ -5749,7 +5749,9 @@ void kernel_flash_attn_ext_impl(
pv += 8*NS20;
}
} else {
FOR_UNROLL (short cc = 0; cc < (C/8)/2; ++cc) {
constexpr short NC = (C/8)/2;
FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
s8x8_t vs[2];
simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false);
@@ -5952,6 +5954,7 @@ kernel void kernel_flash_attn_ext(
//case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
//case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
}
#undef FWD_TMPL
#undef FWD_ARGS
+1
View File
@@ -187,6 +187,7 @@ llama_build_and_test(test-chat-parser.cpp)
llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
llama_build_and_test(test-chat-template.cpp)
llama_build_and_test(test-jinja.cpp)
llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
llama_build_and_test(test-json-partial.cpp)
llama_build_and_test(test-log.cpp)
llama_build_and_test(
+114 -126
View File
@@ -54,113 +54,109 @@ static void assert_throws(const std::function<void()> & fn, const std::string &
static void test_reasoning() {
//common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
{
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
});
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals("<tnk>Cogito</tnk>Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
});
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals(std::string("Cogito"), builder.result().reasoning_content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
});
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals("Cogito</tnk>Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
});
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals(std::string("Cogito"), builder.result().reasoning_content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ true,
/* .thinking_forced_open = */ true,
});
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = true;
params.thinking_forced_open = true;
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals("<think>Cogito</think>", builder.result().content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
const std::string variant("content_only_inline_think");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
params.parse_tool_calls = false;
const std::string input = "<think>Pense</think>Bonjour";
auto msg = common_chat_parse(input, false, syntax);
auto msg = common_chat_parse(input, false, params);
assert_equals(variant, std::string("Pense"), msg.reasoning_content);
assert_equals(variant, std::string("Bonjour"), msg.content);
}
{
const std::string variant("llama_3_inline_think");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_LLAMA_3_X;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
params.parse_tool_calls = false;
const std::string input = "<think>Plan</think>Réponse";
auto msg = common_chat_parse(input, false, syntax);
auto msg = common_chat_parse(input, false, params);
assert_equals(variant, std::string("Plan"), msg.reasoning_content);
assert_equals(variant, std::string("Réponse"), msg.content);
}
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("deepseek_v3_1_reasoning_format_deepseek");
common_chat_msg_parser builder("REASONING</think>ok", /* is_partial= */ false, syntax);
common_chat_msg_parser builder("REASONING</think>ok", /* is_partial= */ false, params);
assert_equals(variant, true, builder.try_parse_reasoning("<think>", "</think>"));
assert_equals(variant, std::string("REASONING"), builder.result().reasoning_content);
assert_equals(variant, std::string("ok"), builder.consume_rest());
}
// Test DeepSeek V3.1 parsing - reasoning_format none - reasoning content followed by "</think>" and then regular content
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("deepseek_v3_1_reasoning_format_none");
const std::string input = "REASONING</think>ok";
auto msg = common_chat_parse(input, false, syntax);
auto msg = common_chat_parse(input, false, params);
assert_equals(variant, std::string("REASONING</think>ok"), msg.content);
assert_equals(variant, std::string(""), msg.reasoning_content);
}
@@ -256,15 +252,14 @@ static void test_deepseek_v3_1_tool_calls() {
//common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
// variant: happy path for when it works as the model card says it should
const std::string variant("simple");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
params.parse_tool_calls = true;
const std::string input = "<tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Tokyo\"}<tool▁call▁end><tool▁calls▁end>";
auto msg = common_chat_parse(input, false, syntax);
auto msg = common_chat_parse(input, false, params);
assert_equals<std::size_t>(variant, 1, msg.tool_calls.size());
assert_equals(variant, std::string("get_time"), msg.tool_calls[0].name);
// JSON arguments are dumped without spaces
@@ -274,16 +269,15 @@ static void test_deepseek_v3_1_tool_calls() {
// variant: simple + thinking open
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("simple_thinking");
const std::string in = "REASONING</think><tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Tokyo\"}<tool▁call▁end><tool▁calls▁end>";
auto m = common_chat_parse(in, false, syntax);
auto m = common_chat_parse(in, false, params);
assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
@@ -292,16 +286,15 @@ static void test_deepseek_v3_1_tool_calls() {
}
// variant: simple + multiple tool calls
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
params.parse_tool_calls = true;
const std::string variant("simple_multiple_tool_calls");
const std::string in = "CONTENT<tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Paris\"}<tool▁call▁end><tool▁call▁begin>get_weather<tool▁sep>{\"city\": \"Paris\"}<tool▁call▁end><tool▁calls▁end>";
auto m = common_chat_parse(in, false, syntax);
auto m = common_chat_parse(in, false, params);
assert_equals<std::size_t>(variant, 2, m.tool_calls.size());
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[0].arguments);
@@ -314,16 +307,15 @@ static void test_deepseek_v3_1_tool_calls() {
// variant: thinking forced open + tool call in reasoning content
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("thinking_forced_open_tool_call_in_reasoning");
const std::string in = "REASONING<tool▁calls▁begin><tool▁call▁begin>get_time2<tool▁sep>{\"city\": \"Tokyo2\"}<tool▁call▁end><tool▁calls▁end>REASONING</think><tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Tokyo\"}<tool▁call▁end><tool▁calls▁end>";
auto m = common_chat_parse(in, false, syntax);
auto m = common_chat_parse(in, false, params);
assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
@@ -336,16 +328,15 @@ static void test_deepseek_v3_1_tool_calls() {
// to make tool calls in reasoning content according to the model card, but it does sometimes, so
// add the reasoning content as regular content and parse the tool calls.
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_not_partial");
const std::string in = "REASONING<tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Tokyo\"}<tool▁call▁end><tool▁calls▁end>";
auto m = common_chat_parse(in, false, syntax);
auto m = common_chat_parse(in, false, params);
assert_equals(variant, std::string("REASONING"), m.content);
assert_equals(variant, std::string(""), m.reasoning_content);
assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
@@ -355,16 +346,15 @@ static void test_deepseek_v3_1_tool_calls() {
// variant: thinking forced open + tool call in reasoning content + no closing think + partial
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_partial");
const std::string in = "REASONING<tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Tokyo\"}<tool▁call▁end><tool▁calls▁end>";
auto m = common_chat_parse(in, /* is_partial= */ true, syntax);
auto m = common_chat_parse(in, /* is_partial= */ true, params);
assert_equals(variant, std::string("REASONING<tool▁calls▁begin><tool▁call▁begin>get_time<tool▁sep>{\"city\": \"Tokyo\"}<tool▁call▁end><tool▁calls▁end>"), m.reasoning_content);
assert_equals(variant, std::string(""), m.content);
assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
@@ -372,32 +362,30 @@ static void test_deepseek_v3_1_tool_calls() {
// variant: thinking not forced open + reasoning + regular content + no tool calls
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = true;
params.parse_tool_calls = true;
const std::string variant("thinking_forced_open_reasoning_regular_content_no_tool_calls");
const std::string in = "REASONING</think>CONTENT";
auto m = common_chat_parse(in, false, syntax);
auto m = common_chat_parse(in, false, params);
assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
assert_equals(variant, std::string("CONTENT"), m.content);
assert_equals(variant, std::string("REASONING"), m.reasoning_content);
}
// variant: thinking not forced open + missing reasoning + no tool calls
{
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ true,
};
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
params.parse_tool_calls = true;
const std::string variant("thinking_not_forced_open_missing_reasoning_no_tool_calls");
const std::string in = "CONTENT";
auto m = common_chat_parse(in, false, syntax);
auto m = common_chat_parse(in, false, params);
assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
assert_equals(variant, std::string("CONTENT"), m.content);
assert_equals(variant, std::string(""), m.reasoning_content);
+6 -6
View File
@@ -616,15 +616,15 @@ void test_command7_parser_compare(testing & t) {
auto test_legacy = [&](const std::string & input, bool need_more_input, bool print_results) {
// Original common_chat_combinator_parser taken from chat.cpp
common_chat_parser_params params;
params.format = COMMON_CHAT_FORMAT_GENERIC;
params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
params.reasoning_in_content = false;
params.thinking_forced_open = false;
common_chat_msg_parser builder(
input,
/* .is_partial = */ need_more_input,
{
/* .format = */ COMMON_CHAT_FORMAT_GENERIC,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}
params
);
builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");
+238 -216
View File
File diff suppressed because it is too large Load Diff
+78
View File
@@ -191,6 +191,84 @@ static void test_conditionals(testing & t) {
json::object(),
"yes"
);
test_template(t, "is undefined falsy",
"{{ 'yes' if not y else 'no' }}",
json::object(),
"yes"
);
test_template(t, "is undefined attribute falsy",
"{{ 'yes' if not y.x else 'no' }}",
{{"y", true}},
"yes"
);
test_template(t, "is undefined key falsy",
"{{ 'yes' if not y['x'] else 'no' }}",
{{"y", {{}}}},
"yes"
);
test_template(t, "is empty array falsy",
"{{ 'yes' if not y else 'no' }}",
{{"y", json::array()}},
"yes"
);
test_template(t, "is empty object falsy",
"{{ 'yes' if not y else 'no' }}",
{{"y", json::object()}},
"yes"
);
test_template(t, "is empty string falsy",
"{{ 'yes' if not y else 'no' }}",
{{"y", ""}},
"yes"
);
test_template(t, "is 0 falsy",
"{{ 'yes' if not y else 'no' }}",
{{"y", 0}},
"yes"
);
test_template(t, "is 0.0 falsy",
"{{ 'yes' if not y else 'no' }}",
{{"y", 0.0}},
"yes"
);
test_template(t, "is non-empty array truthy",
"{{ 'yes' if y else 'no' }}",
{{"y", json::array({""})}},
"yes"
);
test_template(t, "is non-empty object truthy",
"{{ 'yes' if y else 'no' }}",
{{"y", {"x", false}}},
"yes"
);
test_template(t, "is non-empty string truthy",
"{{ 'yes' if y else 'no' }}",
{{"y", "0"}},
"yes"
);
test_template(t, "is 1 truthy",
"{{ 'yes' if y else 'no' }}",
{{"y", 1}},
"yes"
);
test_template(t, "is 1.0 truthy",
"{{ 'yes' if y else 'no' }}",
{{"y", 1.0}},
"yes"
);
}
static void test_loops(testing & t) {
+31 -6
View File
@@ -66,19 +66,25 @@ struct cli_context {
defaults.stream = true; // make sure we always use streaming mode
defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
// defaults.return_progress = true; // TODO: show progress
defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
}
std::string generate_completion(result_timings & out_timings) {
server_response_reader rd = ctx_server.get_response_reader();
auto chat_params = format_chat();
{
// TODO: reduce some copies here in the future
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
task.id = rd.get_new_id();
task.index = 0;
task.params = defaults; // copy
task.cli_input = messages; // copy
task.cli_files = input_files; // copy
task.id = rd.get_new_id();
task.index = 0;
task.params = defaults; // copy
task.cli_prompt = chat_params.prompt; // copy
task.cli_files = input_files; // copy
task.cli = true;
// chat template settings
task.params.chat_parser_params = common_chat_parser_params(chat_params);
task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
rd.post_task({std::move(task)});
}
@@ -156,6 +162,25 @@ struct cli_context {
return content;
}
}
common_chat_params format_chat() {
auto meta = ctx_server.get_meta();
auto & chat_params = meta.chat_params;
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
inputs.tools = {}; // TODO
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
inputs.json_schema = ""; // TODO
inputs.grammar = ""; // TODO
inputs.use_jinja = chat_params.use_jinja;
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.enable_thinking = chat_params.enable_thinking;
// Apply chat template to the list of messages
return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
}
};
int main(int argc, char ** argv) {
+2 -2
View File
@@ -831,7 +831,7 @@ static void handle_media(
// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */
const oaicompat_parser_options & opt,
const server_chat_params & opt,
std::vector<raw_buffer> & out_files)
{
json llama_params;
@@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse(
}
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
/* Append assistant prefilled message */
if (prefill_assistant_message) {
+8 -7
View File
@@ -274,25 +274,26 @@ std::vector<server_tokens> tokenize_input_prompts(
// OAI utils
//
// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);
struct oaicompat_parser_options {
// global server parameters for chat formatting / parsing
struct server_chat_params {
bool use_jinja;
bool prefill_assistant;
common_reasoning_format reasoning_format;
std::map<std::string,std::string> chat_template_kwargs;
common_chat_templates * tmpls;
std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value
common_chat_templates_ptr tmpls;
bool allow_image;
bool allow_audio;
bool enable_thinking = true;
std::string media_path;
};
// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);
// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */
const oaicompat_parser_options & opt,
const server_chat_params & opt,
std::vector<raw_buffer> & out_files);
// convert Anthropic Messages API format to OpenAI Chat Completions API format
+70 -81
View File
@@ -534,8 +534,8 @@ public:
server_queue queue_tasks;
server_response queue_results;
common_chat_templates_ptr chat_templates;
oaicompat_parser_options oai_parser_opt;
// note: chat_params must not be refreshed upon existing sleeping state
server_chat_params chat_params;
~server_context_impl() {
if (!sleeping) {
@@ -688,15 +688,6 @@ private:
llama_init_dft->free_context();
}
chat_templates = common_chat_templates_init(model, params_base.chat_template);
try {
common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
} catch (const std::exception & e) {
SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_init(model, "chatml");
}
std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()) {
if (!is_resume) {
@@ -845,30 +836,6 @@ private:
model_name = model_path.filename().string();
}
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 2. The chat template supports it
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
SRV_INF("thinking = %d\n", enable_thinking);
oai_parser_opt = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* chat_template_kwargs */ params_base.default_template_kwargs,
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* media_path */ params_base.media_path,
};
// print sample chat example to make it clear which template is used
// @ngxson modern templates are too long, spam the logs; printing the example is enough
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
// common_chat_templates_source(chat_templates.get()),
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
if (!is_resume) {
return init();
}
@@ -907,6 +874,42 @@ private:
}
}
// populate chat template params
{
common_chat_templates_ptr chat_templates;
try {
chat_templates = common_chat_templates_init(model, params_base.chat_template);
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
} catch (const std::exception & e) {
SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
return false;
}
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 2. The chat template supports it
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
chat_params = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* chat_template_kwargs */ params_base.default_template_kwargs,
/* tmpls */ std::move(chat_templates),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* media_path */ params_base.media_path,
};
}
return true;
}
@@ -1326,11 +1329,12 @@ private:
}
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
const size_t n_probs = slot.task->params.sampling.n_probs;
const size_t n_probs_request = slot.task->params.sampling.n_probs;
if (post_sampling) {
const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true);
const size_t max_probs = cur_p->size;
const size_t n_probs = std::min(max_probs, n_probs_request);
// set probability for sampled token
for (size_t i = 0; i < max_probs; i++) {
@@ -1341,8 +1345,8 @@ private:
}
// set probability for top n_probs tokens
result.probs.reserve(max_probs);
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
result.probs.reserve(n_probs);
for (size_t i = 0; i < n_probs; i++) {
result.probs.push_back({
cur_p->data[i].id,
common_token_to_piece(ctx, cur_p->data[i].id, special),
@@ -1352,9 +1356,11 @@ private:
} else {
// TODO: optimize this with min-p optimization
std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
const size_t max_probs = cur.size();
const size_t n_probs = std::min(max_probs, n_probs_request);
// set probability for sampled token
for (size_t i = 0; i < cur.size(); i++) {
for (size_t i = 0; i < max_probs; i++) {
// set probability for sampled token
if (cur[i].id == result.tok) {
result.prob = cur[i].p;
@@ -1364,7 +1370,7 @@ private:
// set probability for top n_probs tokens
result.probs.reserve(n_probs);
for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) {
for (size_t i = 0; i < n_probs; i++) {
result.probs.push_back({
cur[i].id,
common_token_to_piece(ctx, cur[i].id, special),
@@ -1585,32 +1591,14 @@ private:
// tokenize the input if it's set by CLI, return false on error
bool tokenize_cli_input(server_task & task) {
GGML_ASSERT(task.cli_input != nullptr);
try {
auto & opt = oai_parser_opt;
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
inputs.tools = {}; // TODO
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
inputs.json_schema = ""; // TODO
inputs.grammar = ""; // TODO
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.reasoning_format = opt.reasoning_format;
inputs.enable_thinking = opt.enable_thinking;
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
// tokenize the resulting prompt
auto & prompt = chat_params.prompt;
auto & prompt = task.cli_prompt;
if (mctx != nullptr) {
task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
} else {
task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
}
task.cli_input.clear();
task.cli_prompt.clear();
task.cli_files.clear();
} catch (const std::exception & e) {
send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -1686,7 +1674,7 @@ private:
{
// special case: if input is provided via CLI, tokenize it first
// otherwise, no need to tokenize as it's already done inside the HTTP thread
if (task.cli_input != nullptr) {
if (task.cli) {
if (!tokenize_cli_input(task)) {
break;
}
@@ -2898,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
}
server_context_meta server_context::get_meta() const {
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
auto bos_id = llama_vocab_bos(impl->vocab);
auto eos_id = llama_vocab_eos(impl->vocab);
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
@@ -2910,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
/* model_name */ impl->model_name,
/* model_path */ impl->params_base.model.path,
/* has_mtmd */ impl->mctx != nullptr,
/* has_inp_image */ impl->oai_parser_opt.allow_image,
/* has_inp_audio */ impl->oai_parser_opt.allow_audio,
/* has_inp_image */ impl->chat_params.allow_image,
/* has_inp_audio */ impl->chat_params.allow_audio,
/* json_webui_settings */ impl->json_webui_settings,
/* slot_n_ctx */ impl->get_slot_n_ctx(),
/* pooling_type */ llama_pooling_type(impl->ctx),
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
/* chat_params */ impl->chat_params,
/* bos_token_str */ bos_token_str,
/* eos_token_str */ eos_token_str,
@@ -3199,8 +3184,8 @@ void server_routes::init_routes() {
// this endpoint can be accessed during sleeping
// the next LOC is to avoid someone accidentally use ctx_server
bool server_ctx; // do NOT delete this line
GGML_UNUSED(server_ctx);
bool ctx_server; // do NOT delete this line
GGML_UNUSED(ctx_server);
res->ok({{"status", "ok"}});
return res;
@@ -3390,8 +3375,8 @@ void server_routes::init_routes() {
// this endpoint can be accessed during sleeping
// the next LOC is to avoid someone accidentally use ctx_server
bool server_ctx; // do NOT delete this line
GGML_UNUSED(server_ctx);
bool ctx_server; // do NOT delete this line
GGML_UNUSED(ctx_server);
task_params tparams;
tparams.sampling = params.sampling;
@@ -3400,6 +3385,9 @@ void server_routes::init_routes() {
{ "n_ctx", meta->slot_n_ctx },
};
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
json props = {
{ "default_generation_settings", default_generation_settings_for_props },
{ "total_slots", params.n_parallel },
@@ -3414,15 +3402,15 @@ void server_routes::init_routes() {
{ "endpoint_metrics", params.endpoint_metrics },
{ "webui", params.webui },
{ "webui_settings", meta->json_webui_settings },
{ "chat_template", meta->chat_template },
{ "chat_template", tmpl_default },
{ "bos_token", meta->bos_token_str },
{ "eos_token", meta->eos_token_str },
{ "build_info", meta->build_info },
{ "is_sleeping", queue_tasks.is_sleeping() },
};
if (params.use_jinja) {
if (!meta->chat_template_tool_use.empty()) {
props["chat_template_tool_use"] = meta->chat_template_tool_use;
if (!tmpl_tools.empty()) {
props["chat_template_tool_use"] = tmpl_tools;
}
}
res->ok(props);
@@ -3443,6 +3431,7 @@ void server_routes::init_routes() {
this->get_api_show = [this](const server_http_req &) {
auto res = create_response();
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
json data = {
{
"model_info", {
@@ -3451,7 +3440,7 @@ void server_routes::init_routes() {
},
{"modelfile", ""},
{"parameters", ""},
{"template", meta->chat_template},
{"template", tmpl_default},
{"details", {
{"parent_model", ""},
{"format", "gguf"},
@@ -3576,7 +3565,7 @@ void server_routes::init_routes() {
json body = json::parse(req.body);
json body_parsed = oaicompat_chat_params_parse(
body,
ctx_server.oai_parser_opt,
meta->chat_params,
files);
return handle_completions_impl(
req,
@@ -3592,7 +3581,7 @@ void server_routes::init_routes() {
json body = convert_anthropic_to_oai(json::parse(req.body));
json body_parsed = oaicompat_chat_params_parse(
body,
ctx_server.oai_parser_opt,
meta->chat_params,
files);
return handle_completions_impl(
req,
@@ -3608,7 +3597,7 @@ void server_routes::init_routes() {
json body = convert_anthropic_to_oai(json::parse(req.body));
json body_parsed = oaicompat_chat_params_parse(
body,
ctx_server.oai_parser_opt,
meta->chat_params,
files);
json prompt = body_parsed.at("prompt");
@@ -3624,7 +3613,7 @@ void server_routes::init_routes() {
json body = json::parse(req.body);
json data = oaicompat_chat_params_parse(
body,
ctx_server.oai_parser_opt,
meta->chat_params,
files);
res->ok({{ "prompt", std::move(data.at("prompt")) }});
return res;
@@ -3635,8 +3624,8 @@ void server_routes::init_routes() {
// this endpoint can be accessed during sleeping
// the next LOC is to avoid someone accidentally use ctx_server
bool server_ctx; // do NOT delete this line
GGML_UNUSED(server_ctx);
bool ctx_server; // do NOT delete this line
GGML_UNUSED(ctx_server);
json models = {
{"models", {
+2 -3
View File
@@ -20,9 +20,8 @@ struct server_context_meta {
int slot_n_ctx;
enum llama_pooling_type pooling_type;
// chat template
std::string chat_template;
std::string chat_template_tool_use;
// chat params
server_chat_params & chat_params;
// tokens
std::string bos_token_str;
+17 -17
View File
@@ -68,10 +68,10 @@ json task_params::to_json(bool only_metrics) const {
{"stream", stream},
{"n_probs", sampling.n_probs},
{"min_keep", sampling.min_keep},
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
{"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -127,10 +127,10 @@ json task_params::to_json(bool only_metrics) const {
{"grammar_lazy", sampling.grammar_lazy},
{"grammar_triggers", grammar_triggers},
{"preserved_tokens", sampling.preserved_tokens},
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
{"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -291,21 +291,21 @@ task_params server_task::params_from_json_cmpl(
{
auto it = data.find("chat_format");
if (it != data.end()) {
params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
params.chat_parser_params.format = static_cast<common_chat_format>(it->get<int>());
SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format));
} else {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
params.chat_parser_params.format = defaults.chat_parser_params.format;
}
common_reasoning_format reasoning_format = params_base.reasoning_format;
if (data.contains("reasoning_format")) {
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
}
params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.oaicompat_chat_syntax.parser.load(data.at("chat_parser").get<std::string>());
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
}
}
@@ -722,7 +722,7 @@ common_chat_msg task_result_state::update_chat_msg(
auto new_msg = common_chat_parse(
generated_text,
is_partial,
oaicompat_chat_syntax);
chat_parser_params);
if (!new_msg.empty()) {
new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
chat_msg = new_msg;
+11 -7
View File
@@ -78,7 +78,9 @@ struct task_params {
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_syntax oaicompat_chat_syntax;
// per-request parameters for chat parsing
common_chat_parser_params chat_parser_params;
// Embeddings
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
@@ -91,7 +93,7 @@ struct task_params {
struct task_result_state {
// tracking diffs for partial tool calls
std::vector<common_chat_msg_diff> diffs;
common_chat_syntax oaicompat_chat_syntax;
common_chat_parser_params chat_parser_params;
common_chat_msg chat_msg;
std::string generated_text; // append new chunks of generated text here
std::vector<std::string> generated_tool_call_ids;
@@ -100,8 +102,8 @@ struct task_result_state {
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
task_result_state(const common_chat_parser_params & chat_parser_params)
: chat_parser_params(chat_parser_params) {}
// parse partial tool calls and update the internal state
common_chat_msg update_chat_msg(
@@ -130,8 +132,10 @@ struct server_task {
task_params params;
server_tokens tokens;
// only used by CLI, this delegates the tokenization to the server
json cli_input = nullptr;
// only used by CLI, this allow tokenizing CLI inputs on server side
// we need this because mtmd_context and vocab are not accessible outside of server_context
bool cli = false;
std::string cli_prompt;
std::vector<raw_buffer> cli_files;
server_task_type type;
@@ -228,7 +232,7 @@ struct server_task {
// the task will be moved into queue, then onto slots
// however, the state must be kept by caller (e.g., HTTP thread)
task_result_state create_state() const {
return task_result_state(params.oaicompat_chat_syntax);
return task_result_state(params.chat_parser_params);
}
bool is_parent() const {