mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d1e3556481 | |||
| 08f3f4a8a3 | |||
| 271191906c | |||
| 7dee9ff59a | |||
| 6df686bee6 | |||
| 1706a6d7c6 | |||
| 959ecf7f23 | |||
| 4037093c66 | |||
| 18361c579c | |||
| 365a3e8c31 | |||
| 3d55846a5c | |||
| 287a33017b | |||
| 293a1565dc | |||
| fe44d35574 | |||
| bbcdac0189 | |||
| d03c45c9c5 | |||
| 10c98cbdf6 | |||
| 420960ab92 | |||
| f55b033ae6 | |||
| d1b4757ded | |||
| 57c0beaed0 | |||
| 2fbde785bc |
+4
-1
@@ -89,7 +89,10 @@ nix:
|
||||
embedding:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/embedding/
|
||||
|
||||
jinja parser:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- common/jinja/**
|
||||
Ascend NPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
/common/common.* @ggerganov
|
||||
/common/console.* @ggerganov
|
||||
/common/http.* @angt
|
||||
/common/jinja/ @ngxson @CISC @aldehir
|
||||
/common/llguidance.* @ggerganov
|
||||
/common/log.* @ggerganov
|
||||
/common/peg-parser.* @aldehir
|
||||
|
||||
@@ -254,7 +254,7 @@ function gg_run_ctest_release {
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
+7
-7
@@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
|
||||
return tmpls->has_explicit_template;
|
||||
}
|
||||
|
||||
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
|
||||
if (variant != nullptr) {
|
||||
if (strcmp(variant, "tool_use") == 0) {
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
|
||||
if (!variant.empty()) {
|
||||
if (variant == "tool_use") {
|
||||
if (tmpls->template_tool_use) {
|
||||
return tmpls->template_tool_use->source().c_str();
|
||||
return tmpls->template_tool_use->source();
|
||||
}
|
||||
return nullptr;
|
||||
return "";
|
||||
} else {
|
||||
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
|
||||
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
|
||||
}
|
||||
}
|
||||
return tmpls->template_default->source().c_str();
|
||||
return tmpls->template_default->source();
|
||||
}
|
||||
|
||||
common_chat_templates_ptr common_chat_templates_init(
|
||||
|
||||
+1
-1
@@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init(
|
||||
const std::string & eos_token_override = "");
|
||||
|
||||
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
|
||||
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
|
||||
|
||||
|
||||
struct common_chat_params common_chat_templates_apply(
|
||||
|
||||
+12
-7
@@ -91,6 +91,16 @@ lexer_result lexer::tokenize(const std::string & source) {
|
||||
return str;
|
||||
};
|
||||
|
||||
auto consume_numeric = [&]() -> std::string {
|
||||
std::string num = consume_while(is_integer);
|
||||
if (pos < src.size() && src[pos] == '.' && pos + 1 < src.size() && is_integer(src[pos + 1])) {
|
||||
++pos; // Consume '.'
|
||||
std::string frac = consume_while(is_integer);
|
||||
num += "." + frac;
|
||||
}
|
||||
return num;
|
||||
};
|
||||
|
||||
auto next_pos_is = [&](std::initializer_list<char> chars, size_t n = 1) -> bool {
|
||||
if (pos + n >= src.size()) return false;
|
||||
for (char c : chars) {
|
||||
@@ -258,7 +268,7 @@ lexer_result lexer::tokenize(const std::string & source) {
|
||||
++pos; // Consume the operator
|
||||
|
||||
// Check for numbers following the unary operator
|
||||
std::string num = consume_while(is_integer);
|
||||
std::string num = consume_numeric();
|
||||
std::string value = std::string(1, ch) + num;
|
||||
token::type t = num.empty() ? token::unary_operator : token::numeric_literal;
|
||||
// JJ_DEBUG("consumed unary operator or numeric literal: '%s'", value.c_str());
|
||||
@@ -307,12 +317,7 @@ lexer_result lexer::tokenize(const std::string & source) {
|
||||
// Numbers
|
||||
if (is_integer(ch)) {
|
||||
start_pos = pos;
|
||||
std::string num = consume_while(is_integer);
|
||||
if (pos < src.size() && src[pos] == '.' && pos + 1 < src.size() && is_integer(src[pos + 1])) {
|
||||
++pos; // Consume '.'
|
||||
std::string frac = consume_while(is_integer);
|
||||
num += "." + frac;
|
||||
}
|
||||
std::string num = consume_numeric();
|
||||
// JJ_DEBUG("consumed numeric literal: '%s'", num.c_str());
|
||||
tokens.push_back({token::numeric_literal, num, start_pos});
|
||||
continue;
|
||||
|
||||
+23
-11
@@ -268,8 +268,7 @@ value binary_expression::execute_impl(context & ctx) {
|
||||
// String in object
|
||||
if (is_val<value_string>(left_val) && is_val<value_object>(right_val)) {
|
||||
auto key = left_val->as_string().str();
|
||||
auto & obj = right_val->as_object();
|
||||
bool has_key = obj.find(key) != obj.end();
|
||||
bool has_key = right_val->has_key(key);
|
||||
if (op.value == "in") {
|
||||
return mk_val<value_bool>(has_key);
|
||||
} else if (op.value == "not in") {
|
||||
@@ -464,7 +463,7 @@ value for_statement::execute_impl(context & ctx) {
|
||||
std::vector<value> items;
|
||||
if (is_val<value_object>(iterable_val)) {
|
||||
JJ_DEBUG("%s", "For loop over object keys");
|
||||
auto & obj = iterable_val->as_object();
|
||||
auto & obj = iterable_val->as_ordered_object();
|
||||
for (auto & p : obj) {
|
||||
auto tuple = mk_val<value_array>();
|
||||
if (iterable_val->val_obj.is_key_numeric) {
|
||||
@@ -560,6 +559,7 @@ value for_statement::execute_impl(context & ctx) {
|
||||
for (size_t i = 0; i < filtered_items.size(); i++) {
|
||||
JJ_DEBUG("For loop iteration %zu/%zu", i + 1, filtered_items.size());
|
||||
value_object loop_obj = mk_val<value_object>();
|
||||
loop_obj->has_builtins = false; // loop object has no builtins
|
||||
loop_obj->insert("index", mk_val<value_int>(i + 1));
|
||||
loop_obj->insert("index0", mk_val<value_int>(i));
|
||||
loop_obj->insert("revindex", mk_val<value_int>(filtered_items.size() - i));
|
||||
@@ -717,6 +717,7 @@ value member_expression::execute_impl(context & ctx) {
|
||||
|
||||
value property;
|
||||
if (this->computed) {
|
||||
// syntax: obj[expr]
|
||||
JJ_DEBUG("Member expression, computing property type %s", this->property->type().c_str());
|
||||
|
||||
int64_t arr_size = 0;
|
||||
@@ -745,10 +746,24 @@ value member_expression::execute_impl(context & ctx) {
|
||||
property = this->property->execute(ctx);
|
||||
}
|
||||
} else {
|
||||
// syntax: obj.prop
|
||||
if (!is_stmt<identifier>(this->property)) {
|
||||
throw std::runtime_error("Non-computed member property must be an identifier");
|
||||
throw std::runtime_error("Static member property must be an identifier");
|
||||
}
|
||||
property = mk_val<value_string>(cast_stmt<identifier>(this->property)->val);
|
||||
std::string prop = property->as_string().str();
|
||||
JJ_DEBUG("Member expression, object type %s, static property '%s'", object->type().c_str(), prop.c_str());
|
||||
|
||||
// behavior of jinja2: obj having prop as a built-in function AND 'prop', as an object key,
|
||||
// then obj.prop returns the built-in function, not the property value.
|
||||
// while obj['prop'] returns the property value.
|
||||
// example: {"obj": {"items": 123}} -> obj.items is the built-in function, obj['items'] is 123
|
||||
|
||||
value val = try_builtin_func(ctx, prop, object, true);
|
||||
if (!is_val<value_undefined>(val)) {
|
||||
return val;
|
||||
}
|
||||
// else, fallthrough to normal property access below
|
||||
}
|
||||
|
||||
JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str());
|
||||
@@ -763,11 +778,8 @@ value member_expression::execute_impl(context & ctx) {
|
||||
throw std::runtime_error("Cannot access object with non-string: got " + property->type());
|
||||
}
|
||||
auto key = property->as_string().str();
|
||||
auto & obj = object->as_object();
|
||||
auto it = obj.find(key);
|
||||
if (it != obj.end()) {
|
||||
val = it->second;
|
||||
} else {
|
||||
val = object->at(key, val);
|
||||
if (is_val<value_undefined>(val)) {
|
||||
val = try_builtin_func(ctx, key, object, true);
|
||||
}
|
||||
JJ_DEBUG("Accessed property '%s' value, got type: %s", key.c_str(), val->type().c_str());
|
||||
@@ -793,7 +805,7 @@ value member_expression::execute_impl(context & ctx) {
|
||||
} else if (is_val<value_string>(property)) {
|
||||
auto key = property->as_string().str();
|
||||
JJ_DEBUG("Accessing %s built-in '%s'", is_val<value_array>(object) ? "array" : "string", key.c_str());
|
||||
val = try_builtin_func(ctx, key, object);
|
||||
val = try_builtin_func(ctx, key, object, true);
|
||||
} else {
|
||||
throw std::runtime_error("Cannot access property with non-string/non-number: got " + property->type());
|
||||
}
|
||||
@@ -802,7 +814,7 @@ value member_expression::execute_impl(context & ctx) {
|
||||
throw std::runtime_error("Cannot access property with non-string: got " + property->type());
|
||||
}
|
||||
auto key = property->as_string().str();
|
||||
val = try_builtin_func(ctx, key, object);
|
||||
val = try_builtin_func(ctx, key, object, true);
|
||||
}
|
||||
|
||||
if (ctx.is_get_stats && val && object && property) {
|
||||
|
||||
@@ -56,6 +56,7 @@ struct context {
|
||||
// src is optional, used for error reporting
|
||||
context(std::string src = "") : src(std::make_shared<std::string>(std::move(src))) {
|
||||
env = mk_val<value_object>();
|
||||
env->has_builtins = false; // context object has no builtins
|
||||
env->insert("true", mk_val<value_bool>(true));
|
||||
env->insert("True", mk_val<value_bool>(true));
|
||||
env->insert("false", mk_val<value_bool>(false));
|
||||
@@ -68,7 +69,7 @@ struct context {
|
||||
|
||||
context(const context & parent) : context() {
|
||||
// inherit variables (for example, when entering a new scope)
|
||||
auto & pvar = parent.env->as_object();
|
||||
auto & pvar = parent.env->as_ordered_object();
|
||||
for (const auto & pair : pvar) {
|
||||
set_val(pair.first, pair.second);
|
||||
}
|
||||
@@ -265,7 +266,7 @@ struct comment_statement : public statement {
|
||||
struct member_expression : public expression {
|
||||
statement_ptr object;
|
||||
statement_ptr property;
|
||||
bool computed;
|
||||
bool computed; // true if obj[expr] and false if obj.prop
|
||||
|
||||
member_expression(statement_ptr && object, statement_ptr && property, bool computed)
|
||||
: object(std::move(object)), property(std::move(property)), computed(computed) {
|
||||
|
||||
+73
-54
@@ -698,6 +698,7 @@ const func_builtins & value_bool_t::get_builtins() const {
|
||||
bool val = args.get_pos(0)->as_bool();
|
||||
return mk_val<value_string>(val ? "True" : "False");
|
||||
}},
|
||||
{"tojson", tojson},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
@@ -775,19 +776,30 @@ const func_builtins & value_array_t::get_builtins() const {
|
||||
if (!is_val<value_array>(args.get_pos(0))) {
|
||||
throw raised_exception("join() first argument must be an array");
|
||||
}
|
||||
value val_delim = args.get_kwarg_or_pos("d", 1);
|
||||
value val_attribute = args.get_kwarg_or_pos("attribute", 2);
|
||||
if (!val_attribute->is_undefined()) {
|
||||
throw not_implemented_exception("array attribute join not implemented");
|
||||
}
|
||||
value val_delim = args.get_kwarg_or_pos("d", 1);
|
||||
value attribute = args.get_kwarg_or_pos("attribute", 2);
|
||||
const auto & arr = args.get_pos(0)->as_array();
|
||||
std::string delim = is_val<value_string>(val_delim) ? val_delim->as_string().str() : "";
|
||||
const bool attr_is_int = is_val<value_int>(attribute);
|
||||
if (!attribute->is_undefined() && !is_val<value_string>(attribute) && !attr_is_int) {
|
||||
throw raised_exception("join() attribute must be string or integer");
|
||||
}
|
||||
const int64_t attr_int = attr_is_int ? attribute->as_int() : 0;
|
||||
const std::string delim = val_delim->is_undefined() ? "" : val_delim->as_string().str();
|
||||
const std::string attr_name = attribute->is_undefined() ? "" : attribute->as_string().str();
|
||||
std::string result;
|
||||
for (size_t i = 0; i < arr.size(); ++i) {
|
||||
if (!is_val<value_string>(arr[i]) && !is_val<value_int>(arr[i]) && !is_val<value_float>(arr[i])) {
|
||||
value val_arr = arr[i];
|
||||
if (!attribute->is_undefined()) {
|
||||
if (attr_is_int && is_val<value_array>(val_arr)) {
|
||||
val_arr = val_arr->at(attr_int);
|
||||
} else if (!attr_is_int && !attr_name.empty() && is_val<value_object>(val_arr)) {
|
||||
val_arr = val_arr->at(attr_name);
|
||||
}
|
||||
}
|
||||
if (!is_val<value_string>(val_arr) && !is_val<value_int>(val_arr) && !is_val<value_float>(val_arr)) {
|
||||
throw raised_exception("join() can only join arrays of strings or numerics");
|
||||
}
|
||||
result += arr[i]->as_string().str();
|
||||
result += val_arr->as_string().str();
|
||||
if (i < arr.size() - 1) {
|
||||
result += delim;
|
||||
}
|
||||
@@ -802,26 +814,30 @@ const func_builtins & value_array_t::get_builtins() const {
|
||||
}},
|
||||
{"tojson", tojson},
|
||||
{"map", [](const func_args & args) -> value {
|
||||
args.ensure_count(2, 3);
|
||||
args.ensure_count(2);
|
||||
if (!is_val<value_array>(args.get_pos(0))) {
|
||||
throw raised_exception("map: first argument must be an array");
|
||||
}
|
||||
value attribute = args.get_kwarg_or_pos("attribute", 1);
|
||||
if (is_val<value_int>(attribute)) {
|
||||
throw not_implemented_exception("map: integer attribute not implemented");
|
||||
if (!is_val<value_kwarg>(args.get_args().at(1))) {
|
||||
throw not_implemented_exception("map: filter-mapping not implemented");
|
||||
}
|
||||
if (!is_val<value_string>(attribute)) {
|
||||
value attribute = args.get_kwarg_or_pos("attribute", 1);
|
||||
const bool attr_is_int = is_val<value_int>(attribute);
|
||||
if (!is_val<value_string>(attribute) && !attr_is_int) {
|
||||
throw raised_exception("map: attribute must be string or integer");
|
||||
}
|
||||
std::string attr_name = attribute->as_string().str();
|
||||
const int64_t attr_int = attr_is_int ? attribute->as_int() : 0;
|
||||
const std::string attr_name = attribute->as_string().str();
|
||||
value default_val = args.get_kwarg("default", mk_val<value_undefined>());
|
||||
auto out = mk_val<value_array>();
|
||||
auto arr = args.get_pos(0)->as_array();
|
||||
for (const auto & item : arr) {
|
||||
if (!is_val<value_object>(item)) {
|
||||
throw raised_exception("map: item is not an object");
|
||||
value attr_val;
|
||||
if (attr_is_int) {
|
||||
attr_val = is_val<value_array>(item) ? item->at(attr_int, default_val) : default_val;
|
||||
} else {
|
||||
attr_val = is_val<value_object>(item) ? item->at(attr_name, default_val) : default_val;
|
||||
}
|
||||
value attr_val = item->at(attr_name, default_val);
|
||||
out->push_back(attr_val);
|
||||
}
|
||||
return out;
|
||||
@@ -847,29 +863,35 @@ const func_builtins & value_array_t::get_builtins() const {
|
||||
return arr_editable->pop_at(index);
|
||||
}},
|
||||
{"sort", [](const func_args & args) -> value {
|
||||
args.ensure_count(1, 3);
|
||||
args.ensure_count(1, 4);
|
||||
if (!is_val<value_array>(args.get_pos(0))) {
|
||||
throw raised_exception("sort: first argument must be an array");
|
||||
}
|
||||
bool reverse = args.get_kwarg("reverse", mk_val<value_undefined>())->as_bool();
|
||||
value attribute = args.get_kwarg("attribute", mk_val<value_undefined>());
|
||||
std::string attr = attribute->is_undefined() ? "" : attribute->as_string().str();
|
||||
value val_reverse = args.get_kwarg_or_pos("reverse", 1);
|
||||
value val_case = args.get_kwarg_or_pos("case_sensitive", 2);
|
||||
value attribute = args.get_kwarg_or_pos("attribute", 3);
|
||||
// FIXME: sorting is currently always case sensitive
|
||||
//const bool case_sensitive = val_case->as_bool(); // undefined == false
|
||||
const bool reverse = val_reverse->as_bool(); // undefined == false
|
||||
const bool attr_is_int = is_val<value_int>(attribute);
|
||||
const int64_t attr_int = attr_is_int ? attribute->as_int() : 0;
|
||||
const std::string attr_name = attribute->is_undefined() ? "" : attribute->as_string().str();
|
||||
std::vector<value> arr = cast_val<value_array>(args.get_pos(0))->as_array(); // copy
|
||||
std::sort(arr.begin(), arr.end(),[&](const value & a, const value & b) {
|
||||
value val_a = a;
|
||||
value val_b = b;
|
||||
if (!attribute->is_undefined()) {
|
||||
if (!is_val<value_object>(a) || !is_val<value_object>(b)) {
|
||||
throw raised_exception("sort: items are not objects");
|
||||
if (attr_is_int && is_val<value_array>(a) && is_val<value_array>(b)) {
|
||||
val_a = a->at(attr_int);
|
||||
val_b = b->at(attr_int);
|
||||
} else if (!attr_is_int && !attr_name.empty() && is_val<value_object>(a) && is_val<value_object>(b)) {
|
||||
val_a = a->at(attr_name);
|
||||
val_b = b->at(attr_name);
|
||||
} else {
|
||||
throw raised_exception("sort: unsupported object attribute comparison");
|
||||
}
|
||||
val_a = attr.empty() ? a : a->at(attr);
|
||||
val_b = attr.empty() ? b : b->at(attr);
|
||||
}
|
||||
if (reverse) {
|
||||
return value_compare(val_a, val_b, value_compare_op::gt);
|
||||
} else {
|
||||
return !value_compare(val_a, val_b, value_compare_op::gt);
|
||||
}
|
||||
return value_compare(val_a, val_b, reverse ? value_compare_op::gt : value_compare_op::lt);
|
||||
});
|
||||
return mk_val<value_array>(arr);
|
||||
}},
|
||||
@@ -888,6 +910,11 @@ const func_builtins & value_array_t::get_builtins() const {
|
||||
|
||||
|
||||
const func_builtins & value_object_t::get_builtins() const {
|
||||
if (!has_builtins) {
|
||||
static const func_builtins no_builtins = {};
|
||||
return no_builtins;
|
||||
}
|
||||
|
||||
static const func_builtins builtins = {
|
||||
// {"default", default_value}, // cause issue with gpt-oss
|
||||
{"get", [](const func_args & args) -> value {
|
||||
@@ -902,18 +929,13 @@ const func_builtins & value_object_t::get_builtins() const {
|
||||
if (args.count() == 3) {
|
||||
default_val = args.get_pos(2);
|
||||
}
|
||||
const auto & obj = args.get_pos(0)->as_object();
|
||||
const value obj = args.get_pos(0);
|
||||
std::string key = args.get_pos(1)->as_string().str();
|
||||
auto it = obj.find(key);
|
||||
if (it != obj.end()) {
|
||||
return it->second;
|
||||
} else {
|
||||
return default_val;
|
||||
}
|
||||
return obj->at(key, default_val);
|
||||
}},
|
||||
{"keys", [](const func_args & args) -> value {
|
||||
args.ensure_vals<value_object>();
|
||||
const auto & obj = args.get_pos(0)->as_object();
|
||||
const auto & obj = args.get_pos(0)->as_ordered_object();
|
||||
auto result = mk_val<value_array>();
|
||||
for (const auto & pair : obj) {
|
||||
result->push_back(mk_val<value_string>(pair.first));
|
||||
@@ -922,7 +944,7 @@ const func_builtins & value_object_t::get_builtins() const {
|
||||
}},
|
||||
{"values", [](const func_args & args) -> value {
|
||||
args.ensure_vals<value_object>();
|
||||
const auto & obj = args.get_pos(0)->as_object();
|
||||
const auto & obj = args.get_pos(0)->as_ordered_object();
|
||||
auto result = mk_val<value_array>();
|
||||
for (const auto & pair : obj) {
|
||||
result->push_back(pair.second);
|
||||
@@ -931,7 +953,7 @@ const func_builtins & value_object_t::get_builtins() const {
|
||||
}},
|
||||
{"items", [](const func_args & args) -> value {
|
||||
args.ensure_vals<value_object>();
|
||||
const auto & obj = args.get_pos(0)->as_object();
|
||||
const auto & obj = args.get_pos(0)->as_ordered_object();
|
||||
auto result = mk_val<value_array>();
|
||||
for (const auto & pair : obj) {
|
||||
auto item = mk_val<value_array>();
|
||||
@@ -945,7 +967,7 @@ const func_builtins & value_object_t::get_builtins() const {
|
||||
{"string", tojson},
|
||||
{"length", [](const func_args & args) -> value {
|
||||
args.ensure_vals<value_object>();
|
||||
const auto & obj = args.get_pos(0)->as_object();
|
||||
const auto & obj = args.get_pos(0)->as_ordered_object();
|
||||
return mk_val<value_int>(static_cast<int64_t>(obj.size()));
|
||||
}},
|
||||
{"tojson", [](const func_args & args) -> value {
|
||||
@@ -958,21 +980,18 @@ const func_builtins & value_object_t::get_builtins() const {
|
||||
value val_case = args.get_kwarg_or_pos("case_sensitive", 1);
|
||||
value val_by = args.get_kwarg_or_pos("by", 2);
|
||||
value val_reverse = args.get_kwarg_or_pos("reverse", 3);
|
||||
// FIXME: sorting is case sensitive
|
||||
// FIXME: sorting is currently always case sensitive
|
||||
//const bool case_sensitive = val_case->as_bool(); // undefined == false
|
||||
const bool reverse = val_reverse->as_bool(); // undefined == false
|
||||
if (!val_by->is_undefined()) {
|
||||
throw not_implemented_exception("dictsort by key not implemented");
|
||||
}
|
||||
if (reverse) {
|
||||
throw not_implemented_exception("dictsort reverse not implemented");
|
||||
}
|
||||
value_t::map obj = val_input->val_obj; // copy
|
||||
std::sort(obj.ordered.begin(), obj.ordered.end(), [&](const auto & a, const auto & b) {
|
||||
return a.first < b.first;
|
||||
const bool by_value = is_val<value_string>(val_by) && val_by->as_string().str() == "value" ? true : false;
|
||||
auto result = mk_val<value_object>(val_input); // copy
|
||||
std::sort(result->val_obj.ordered.begin(), result->val_obj.ordered.end(), [&](const auto & a, const auto & b) {
|
||||
if (by_value) {
|
||||
return value_compare(a.second, b.second, reverse ? value_compare_op::gt : value_compare_op::lt);
|
||||
} else {
|
||||
return reverse ? a.first > b.first : a.first < b.first;
|
||||
}
|
||||
});
|
||||
auto result = mk_val<value_object>();
|
||||
result->val_obj = std::move(obj);
|
||||
return result;
|
||||
}},
|
||||
{"join", [](const func_args &) -> value {
|
||||
@@ -1169,7 +1188,7 @@ static void value_to_json_internal(std::ostringstream & oss, const value & val,
|
||||
}
|
||||
oss << "]";
|
||||
} else if (is_val<value_object>(val)) {
|
||||
const auto & obj = val->val_obj.ordered; // IMPORTANT: need to keep exact order
|
||||
const auto & obj = val->as_ordered_object(); // IMPORTANT: need to keep exact order
|
||||
oss << "{";
|
||||
if (!obj.empty()) {
|
||||
oss << newline();
|
||||
|
||||
+31
-4
@@ -146,7 +146,7 @@ struct value_t {
|
||||
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
|
||||
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
|
||||
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
|
||||
virtual const std::map<std::string, value> & as_object() const { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual const std::vector<std::pair<std::string, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
|
||||
virtual bool is_none() const { return false; }
|
||||
virtual bool is_undefined() const { return false; }
|
||||
@@ -154,6 +154,9 @@ struct value_t {
|
||||
throw std::runtime_error("No builtins available for type " + type());
|
||||
}
|
||||
|
||||
virtual bool has_key(const std::string & key) {
|
||||
return val_obj.unordered.find(key) != val_obj.unordered.end();
|
||||
}
|
||||
virtual value & at(const std::string & key, value & default_val) {
|
||||
auto it = val_obj.unordered.find(key);
|
||||
if (it == val_obj.unordered.end()) {
|
||||
@@ -168,8 +171,20 @@ struct value_t {
|
||||
}
|
||||
return val_obj.unordered.at(key);
|
||||
}
|
||||
virtual value & at(size_t index) {
|
||||
if (index >= val_arr.size()) {
|
||||
virtual value & at(int64_t index, value & default_val) {
|
||||
if (index < 0) {
|
||||
index += val_arr.size();
|
||||
}
|
||||
if (index < 0 || static_cast<size_t>(index) >= val_arr.size()) {
|
||||
return default_val;
|
||||
}
|
||||
return val_arr[index];
|
||||
}
|
||||
virtual value & at(int64_t index) {
|
||||
if (index < 0) {
|
||||
index += val_arr.size();
|
||||
}
|
||||
if (index < 0 || static_cast<size_t>(index) >= val_arr.size()) {
|
||||
throw std::runtime_error("Index " + std::to_string(index) + " out of bounds for array of size " + std::to_string(val_arr.size()));
|
||||
}
|
||||
return val_arr[index];
|
||||
@@ -188,6 +203,9 @@ struct value_int_t : public value_t {
|
||||
virtual int64_t as_int() const override { return val_int; }
|
||||
virtual double as_float() const override { return static_cast<double>(val_int); }
|
||||
virtual string as_string() const override { return std::to_string(val_int); }
|
||||
virtual bool as_bool() const override {
|
||||
return val_int != 0;
|
||||
}
|
||||
virtual const func_builtins & get_builtins() const override;
|
||||
};
|
||||
using value_int = std::shared_ptr<value_int_t>;
|
||||
@@ -204,6 +222,9 @@ struct value_float_t : public value_t {
|
||||
if (out.back() == '.') out.push_back('0'); // leave one zero if no decimals
|
||||
return out;
|
||||
}
|
||||
virtual bool as_bool() const override {
|
||||
return val_flt != 0.0;
|
||||
}
|
||||
virtual const func_builtins & get_builtins() const override;
|
||||
};
|
||||
using value_float = std::shared_ptr<value_float_t>;
|
||||
@@ -286,6 +307,7 @@ using value_array = std::shared_ptr<value_array_t>;
|
||||
|
||||
|
||||
struct value_object_t : public value_t {
|
||||
bool has_builtins = true; // context and loop objects do not have builtins
|
||||
value_object_t() = default;
|
||||
value_object_t(value & v) {
|
||||
val_obj = v->val_obj;
|
||||
@@ -295,11 +317,16 @@ struct value_object_t : public value_t {
|
||||
val_obj.insert(pair.first, pair.second);
|
||||
}
|
||||
}
|
||||
value_object_t(const std::vector<std::pair<std::string, value>> & obj) {
|
||||
for (const auto & pair : obj) {
|
||||
val_obj.insert(pair.first, pair.second);
|
||||
}
|
||||
}
|
||||
void insert(const std::string & key, const value & val) {
|
||||
val_obj.insert(key, val);
|
||||
}
|
||||
virtual std::string type() const override { return "Object"; }
|
||||
virtual const std::map<std::string, value> & as_object() const override { return val_obj.unordered; }
|
||||
virtual const std::vector<std::pair<std::string, value>> & as_ordered_object() const override { return val_obj.ordered; }
|
||||
virtual bool as_bool() const override {
|
||||
return !val_obj.unordered.empty();
|
||||
}
|
||||
|
||||
+31
-2
@@ -1078,6 +1078,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
|
||||
# ref: https://huggingface.co/aari1995/German_Semantic_V3
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
|
||||
# ref: https://huggingface.co/zai-org/GLM-4.7-Flash
|
||||
res = "glm4"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
@@ -7458,7 +7461,7 @@ class DeepseekModel(TextModel):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"KimiVLForConditionalGeneration",
|
||||
"YoutuForCausalLM",
|
||||
"YoutuVLForConditionalGeneration"
|
||||
"YoutuVLForConditionalGeneration",
|
||||
)
|
||||
class DeepseekV2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
@@ -8446,6 +8449,32 @@ class Glm4MoeModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("Glm4MoeLiteForCausalLM")
|
||||
class Glm4MoeLiteModel(DeepseekV2Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
|
||||
# copied from Glm4MoeModel
|
||||
def set_vocab(self):
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# Special tokens
|
||||
# Note: Using <|endoftext|> (151329) for eot causes endless generation
|
||||
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
|
||||
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
|
||||
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
|
||||
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
class ChatGLMModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.CHATGLM
|
||||
@@ -9183,7 +9212,7 @@ class NemotronHModel(GraniteHybridModel):
|
||||
return [(mapped_name, reshaped_data)]
|
||||
|
||||
if name.endswith("mixer.norm.weight"):
|
||||
reshaped_data = data_torch.reshape(8, 512)
|
||||
reshaped_data = data_torch.reshape(self.n_group, -1)
|
||||
mapped_name = self.map_tensor_name(name)
|
||||
return [(mapped_name, reshaped_data)]
|
||||
|
||||
|
||||
@@ -170,6 +170,7 @@ pre_computed_hashes = [
|
||||
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
|
||||
# jina-v2-de variants
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
- [CMake Options](#cmake-options)
|
||||
- [Android](#android)
|
||||
- [Windows 11 Arm64](#windows-11-arm64)
|
||||
- [Linux](#Linux)
|
||||
- [Known Issue](#known-issues)
|
||||
- [TODO](#todo)
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ set -e
|
||||
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
BUILD_DIR="${2:-"$BUILD_DIR"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
@@ -13,6 +14,10 @@ if [ -z "$CONVERTED_MODEL" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
if [ -z "$BUILD_DIR" ]; then
|
||||
BUILD_DIR="../../build"
|
||||
fi
|
||||
|
||||
../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
|
||||
cmake --build ${BUILD_DIR} --target llama-debug -j8
|
||||
|
||||
${BUILD_DIR}/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
|
||||
|
||||
@@ -5,11 +5,16 @@ set -e
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
|
||||
BUILD_DIR="${3:-"$BUILD_DIR"}"
|
||||
|
||||
if [ -z "$MODEL_TESTING_PROMPT"]; then
|
||||
if [ -z "$MODEL_TESTING_PROMPT" ]; then
|
||||
MODEL_TESTING_PROMPT="Hello, my name is"
|
||||
fi
|
||||
|
||||
if [ -z "$BUILD_DIR" ]; then
|
||||
BUILD_DIR="../../build"
|
||||
fi
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
@@ -21,6 +26,6 @@ fi
|
||||
echo $CONVERTED_MODEL
|
||||
echo $MODEL_TESTING_PROMPT
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
cmake --build ${BUILD_DIR} --target llama-debug -j8
|
||||
|
||||
../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
|
||||
${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
|
||||
|
||||
@@ -28,6 +28,7 @@ done
|
||||
|
||||
# First try command line argument, then environment variable
|
||||
CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
BUILD_DIR="${BUILD_DIR:-"../../build"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
@@ -50,5 +51,5 @@ fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
|
||||
cmake --build ${BUILD_DIR} --target llama-debug -j8
|
||||
${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
|
||||
|
||||
+39
-7
@@ -630,10 +630,11 @@ extern "C" {
|
||||
|
||||
// this tensor...
|
||||
enum ggml_tensor_flag {
|
||||
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
|
||||
};
|
||||
|
||||
enum ggml_tri_type {
|
||||
@@ -2577,11 +2578,42 @@ extern "C" {
|
||||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * sgd_params); // alpha, weight decay
|
||||
|
||||
// build forward mutiple tensors and select one of them for computing
|
||||
// this is useful for creating graphs that have constant topology but compute different things based on the input
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
|
||||
//
|
||||
// automatic differentiation
|
||||
// nodes:
|
||||
// | - build forward into the graph but do not compute
|
||||
// c - build forward into the graph and compute
|
||||
//
|
||||
// | | ... c ... |
|
||||
// | | ... c ... |
|
||||
// | | ... c ... |
|
||||
// [0 1 ... idx ... n-1] <-- ggml_build_forward_select(..., n, idx)
|
||||
// c
|
||||
// c
|
||||
//
|
||||
// example:
|
||||
// struct ggml_tensor * curs[3];
|
||||
//
|
||||
// curs[0] = compute0(...);
|
||||
// curs[1] = compute1(...);
|
||||
// curs[2] = compute2(...);
|
||||
//
|
||||
// int idx = select_branch(some_input);
|
||||
//
|
||||
// struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
|
||||
//
|
||||
GGML_API struct ggml_tensor * ggml_build_forward_select(
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** tensors,
|
||||
int n_tensors,
|
||||
int idx);
|
||||
|
||||
GGML_API void ggml_build_forward_expand(
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(
|
||||
struct ggml_context * ctx, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
@@ -2613,7 +2645,7 @@ extern "C" {
|
||||
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
|
||||
// dump the graph into a file using the dot format
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);
|
||||
|
||||
// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
|
||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
@@ -77,39 +77,23 @@
|
||||
#include "ggml-zendnn.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static std::string path_str(const fs::path & path) {
|
||||
std::string u8path;
|
||||
try {
|
||||
#if defined(__cpp_lib_char8_t)
|
||||
// C++20 and later: u8string() returns std::u8string
|
||||
std::u8string u8str = path.u8string();
|
||||
u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
|
||||
const std::u8string u8str = path.u8string();
|
||||
return std::string(reinterpret_cast<const char *>(u8str.data()), u8str.size());
|
||||
#else
|
||||
// C++17: u8string() returns std::string
|
||||
u8path = path.u8string();
|
||||
return path.u8string();
|
||||
#endif
|
||||
} catch (...) {
|
||||
return std::string();
|
||||
}
|
||||
return u8path;
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
||||
@@ -874,9 +874,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||
}
|
||||
if (sched->debug > 1) {
|
||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
|
||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
|
||||
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
|
||||
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
@@ -1922,6 +1922,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
|
||||
dst->view_offs = src->view_offs;
|
||||
}
|
||||
dst->op = src->op;
|
||||
dst->flags = src->flags;
|
||||
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
|
||||
ggml_set_name(dst, src->name);
|
||||
|
||||
|
||||
@@ -226,6 +226,10 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
ggml_backend_blas_mul_mat(ctx, node);
|
||||
|
||||
@@ -2146,6 +2146,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
|
||||
@@ -2943,6 +2943,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
|
||||
@@ -14,12 +14,6 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx <= nrows) {
|
||||
offsets[idx] = idx * ncols;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
@@ -31,18 +25,15 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
cudaStream_t stream) {
|
||||
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
|
||||
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
|
||||
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
|
||||
|
||||
int * temp_indices = temp_indices_alloc.get();
|
||||
float * temp_keys = temp_keys_alloc.get();
|
||||
int * d_offsets = offsets_alloc.get();
|
||||
|
||||
static const int block_size = 256;
|
||||
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
|
||||
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
|
||||
|
||||
const dim3 offset_grid((nrows + block_size - 1) / block_size);
|
||||
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
|
||||
auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
||||
|
||||
@@ -57,7 +48,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols * nrows, nrows, // num items, num segments
|
||||
d_offsets, d_offsets + 1, stream);
|
||||
offset_iterator, offset_iterator + 1, stream);
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
@@ -66,7 +57,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
|
||||
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
|
||||
dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,7 +72,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
|
||||
ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
|
||||
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream);
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
@@ -89,8 +81,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
|
||||
stream);
|
||||
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
|
||||
offset_iterator + 1, stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1123,6 +1123,7 @@ struct ggml_tensor_extra_gpu {
|
||||
struct ggml_cuda_graph_node_properties {
|
||||
void * node_address;
|
||||
ggml_op node_op;
|
||||
int32_t flags;
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
void * src_address[GGML_MAX_SRC];
|
||||
|
||||
@@ -2918,6 +2918,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
|
||||
props->node_address = node->data;
|
||||
props->node_op = node->op;
|
||||
props->flags = node->flags;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
props->ne[i] = node->ne[i];
|
||||
props->nb[i] = node->nb[i];
|
||||
@@ -2961,6 +2962,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3378,6 +3383,9 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
# include <cub/cub.cuh>
|
||||
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
|
||||
# include <cuda/iterator>
|
||||
# define CUB_TOP_K_AVAILABLE
|
||||
using namespace cub;
|
||||
# endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
|
||||
|
||||
@@ -2497,6 +2497,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t flags = 0;
|
||||
|
||||
// skip quantizer if src1 is reused
|
||||
|
||||
@@ -611,6 +611,9 @@ static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const in
|
||||
if (node->op != ops[i]) {
|
||||
return false;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return false;
|
||||
}
|
||||
if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1078,12 +1078,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
op->src[0]->ne[0] != 112 &&
|
||||
op->src[0]->ne[0] != 128 &&
|
||||
op->src[0]->ne[0] != 192 &&
|
||||
op->src[0]->ne[0] != 256) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek sizes
|
||||
// TODO: disabled for now, until optmized
|
||||
op->src[0]->ne[0] != 256 &&
|
||||
op->src[0]->ne[0] != 576) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
|
||||
@@ -203,6 +203,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
GGML_ABORT("unsupported op");
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
// check if the current node can run concurrently with other nodes before it
|
||||
@@ -2516,7 +2520,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
// simdgroups per threadgroup (a.k.a. warps)
|
||||
//nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
|
||||
int32_t nsg = 4;
|
||||
int32_t nsg = ne00 >= 512 ? 8 : 4;
|
||||
|
||||
const size_t smem = FATTN_SMEM(nsg);
|
||||
|
||||
|
||||
@@ -5552,9 +5552,7 @@ void kernel_flash_attn_ext_impl(
|
||||
|
||||
constexpr short NC = (C/8)/NSG;
|
||||
|
||||
// note: do not unroll for large heads
|
||||
#pragma unroll (DK <= 64 ? NC : 1)
|
||||
for (short cc = 0; cc < NC; ++cc) {
|
||||
FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
|
||||
qk8x8_t mqk = make_filled_simdgroup_matrix<qk_t, 8>((qk_t) 0.0f);
|
||||
|
||||
if (DK % 16 != 0) {
|
||||
@@ -5575,7 +5573,9 @@ void kernel_flash_attn_ext_impl(
|
||||
k8x8_t mk[2];
|
||||
q8x8_t mq[2];
|
||||
|
||||
FOR_UNROLL (short i = 0; i < DK8/2; ++i) {
|
||||
// note: too much unroll can tank the performance for large heads
|
||||
#pragma unroll (MIN(DK8/2, 4*NSG))
|
||||
for (short i = 0; i < DK8/2; ++i) {
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
simdgroup_load(mq[0], pq + 0*8 + 16*i, DK);
|
||||
@@ -5749,7 +5749,9 @@ void kernel_flash_attn_ext_impl(
|
||||
pv += 8*NS20;
|
||||
}
|
||||
} else {
|
||||
FOR_UNROLL (short cc = 0; cc < (C/8)/2; ++cc) {
|
||||
constexpr short NC = (C/8)/2;
|
||||
|
||||
FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
|
||||
s8x8_t vs[2];
|
||||
|
||||
simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false);
|
||||
@@ -5952,6 +5954,7 @@ kernel void kernel_flash_attn_ext(
|
||||
//case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
|
||||
//case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
|
||||
case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
|
||||
case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
|
||||
}
|
||||
#undef FWD_TMPL
|
||||
#undef FWD_ARGS
|
||||
|
||||
@@ -3058,6 +3058,10 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
|
||||
ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
|
||||
i += 2;
|
||||
|
||||
@@ -111,6 +111,10 @@ kernel void kernel_mul_mv_q6_K_f32(
|
||||
|
||||
int row = N_SIMDGROUP * r0 + get_sub_group_id();
|
||||
|
||||
if (row >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
|
||||
@@ -4109,6 +4109,9 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
|
||||
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
|
||||
@@ -12191,6 +12191,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) {
|
||||
return false;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
|
||||
ctx->semaphore_idx = 0;
|
||||
@@ -13645,7 +13648,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||
while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
|
||||
while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) {
|
||||
last_node -= 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1982,6 +1982,9 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
|
||||
if (ggml_is_empty(node)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return std::nullopt;
|
||||
}
|
||||
WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
|
||||
|
||||
ggml_tensor * src0 = node->src[0];
|
||||
|
||||
@@ -58,6 +58,10 @@ static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgr
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool ok = ggml_zdnn_compute_forward(ctx, node);
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
|
||||
|
||||
@@ -211,6 +211,10 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
ggml_zendnn_compute_forward_mul_mat(ctx, node);
|
||||
|
||||
+52
-18
@@ -3441,7 +3441,8 @@ struct ggml_tensor * ggml_cast(
|
||||
|
||||
result->op = GGML_OP_CPY;
|
||||
result->src[0] = a;
|
||||
result->src[1] = result;
|
||||
result->src[1] = result; // note: this self-reference might seem redundant, but it's actually needed by some
|
||||
// backends for consistency with ggml_cpy_impl() above
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -6725,20 +6726,35 @@ static void ggml_compute_backward(
|
||||
GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
|
||||
}
|
||||
|
||||
static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
|
||||
// check if already visited
|
||||
size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
static size_t ggml_visit_parents_graph(struct ggml_cgraph * cgraph, struct ggml_tensor * node, bool compute) {
|
||||
if (node->op != GGML_OP_NONE && compute) {
|
||||
node->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
}
|
||||
|
||||
const size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
|
||||
if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
|
||||
// This is the first time we see this node in the current graph.
|
||||
cgraph->visited_hash_set.keys[node_hash_pos] = node;
|
||||
ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
|
||||
cgraph->use_counts[node_hash_pos] = 0;
|
||||
} else {
|
||||
|
||||
if (ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
|
||||
// already visited
|
||||
|
||||
if (compute) {
|
||||
// update the compute flag regardless
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
struct ggml_tensor * src = node->src[i];
|
||||
if (src && ((src->flags & GGML_TENSOR_FLAG_COMPUTE) == 0)) {
|
||||
ggml_visit_parents_graph(cgraph, src, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return node_hash_pos;
|
||||
}
|
||||
|
||||
// This is the first time we see this node in the current graph.
|
||||
cgraph->visited_hash_set.keys[node_hash_pos] = node;
|
||||
ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
|
||||
cgraph->use_counts[node_hash_pos] = 0;
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
const int k =
|
||||
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
|
||||
@@ -6747,7 +6763,7 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor
|
||||
|
||||
struct ggml_tensor * src = node->src[k];
|
||||
if (src) {
|
||||
size_t src_hash_pos = ggml_visit_parents(cgraph, src);
|
||||
const size_t src_hash_pos = ggml_visit_parents_graph(cgraph, src, compute);
|
||||
|
||||
// Update the use count for this operand.
|
||||
cgraph->use_counts[src_hash_pos]++;
|
||||
@@ -6778,17 +6794,17 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor
|
||||
return node_hash_pos;
|
||||
}
|
||||
|
||||
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
|
||||
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand, bool compute) {
|
||||
if (!expand) {
|
||||
// TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
|
||||
ggml_graph_clear(cgraph);
|
||||
}
|
||||
|
||||
const int n0 = cgraph->n_nodes;
|
||||
const int n_old = cgraph->n_nodes;
|
||||
|
||||
ggml_visit_parents(cgraph, tensor);
|
||||
ggml_visit_parents_graph(cgraph, tensor, compute);
|
||||
|
||||
const int n_new = cgraph->n_nodes - n0;
|
||||
const int n_new = cgraph->n_nodes - n_old;
|
||||
GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
|
||||
|
||||
if (n_new > 0) {
|
||||
@@ -6797,8 +6813,22 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_build_forward_select(
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** tensors,
|
||||
int n_tensors,
|
||||
int idx) {
|
||||
GGML_ASSERT(idx >= 0 && idx < n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
ggml_build_forward_impl(cgraph, tensors[i], true, i == idx ? true : false);
|
||||
}
|
||||
|
||||
return tensors[idx];
|
||||
}
|
||||
|
||||
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||
ggml_build_forward_impl(cgraph, tensor, true);
|
||||
ggml_build_forward_impl(cgraph, tensor, true, true);
|
||||
}
|
||||
|
||||
void ggml_build_backward_expand(
|
||||
@@ -7229,6 +7259,10 @@ bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ggml_node_list_find_tensor(cgraph, outputs, num_outputs, node) != -1) {
|
||||
continue;
|
||||
}
|
||||
@@ -7310,7 +7344,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
|
||||
label);
|
||||
}
|
||||
|
||||
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
||||
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename) {
|
||||
char color[16];
|
||||
|
||||
FILE * fp = ggml_fopen(filename, "w");
|
||||
@@ -7331,7 +7365,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
snprintf(color, sizeof(color), "yellow");
|
||||
} else if (grad) {
|
||||
if (ggml_graph_find(gf, node)) {
|
||||
if (ggml_graph_find(cgraph, node)) {
|
||||
snprintf(color, sizeof(color), "green");
|
||||
} else {
|
||||
snprintf(color, sizeof(color), "lightblue");
|
||||
|
||||
@@ -200,42 +200,6 @@ uint32_t llama_hparams::n_layer_kv() const {
|
||||
return res;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
||||
assert(p0 >= 0 && p1 >= 0);
|
||||
|
||||
switch (swa_type) {
|
||||
case LLAMA_SWA_TYPE_NONE:
|
||||
{
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_STANDARD:
|
||||
{
|
||||
if (p1 - p0 >= (int32_t) n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_CHUNKED:
|
||||
{
|
||||
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
||||
|
||||
if (p0 < pos_chunk_start) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_SYMMETRIC:
|
||||
{
|
||||
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
||||
const int32_t pos_diff = p1 - p0;
|
||||
|
||||
// Mask if outside the symmetric window
|
||||
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llama_hparams::use_mrope() const {
|
||||
return rope_sections[0] > 0 && rope_sections[1] > 0;
|
||||
}
|
||||
|
||||
+38
-1
@@ -3,6 +3,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
|
||||
// bump if necessary
|
||||
#define LLAMA_MAX_LAYERS 512
|
||||
@@ -274,9 +275,45 @@ struct llama_hparams {
|
||||
uint32_t n_layer_kv() const;
|
||||
|
||||
// note that this function uses different SWA parameters from those in the hparams
|
||||
// note: inlined on purpose for performance reasons
|
||||
// TODO: think of a better place for this function
|
||||
// TODO: pack the SWA params in a struct?
|
||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
||||
assert(p0 >= 0 && p1 >= 0);
|
||||
|
||||
switch (swa_type) {
|
||||
case LLAMA_SWA_TYPE_NONE:
|
||||
{
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_STANDARD:
|
||||
{
|
||||
if (p1 - p0 >= (int32_t) n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_CHUNKED:
|
||||
{
|
||||
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
||||
|
||||
if (p0 < pos_chunk_start) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_SYMMETRIC:
|
||||
{
|
||||
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
||||
const int32_t pos_diff = p1 - p0;
|
||||
|
||||
// Mask if outside the symmetric window
|
||||
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool use_mrope() const;
|
||||
};
|
||||
|
||||
+212
-70
@@ -852,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
||||
const llama_seq_id seq_id_cell = cells.seq_get(idx);
|
||||
|
||||
// SWA mask
|
||||
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
can_use = true;
|
||||
}
|
||||
}
|
||||
@@ -1237,6 +1237,197 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
|
||||
}
|
||||
}
|
||||
|
||||
struct args_set_input_kq_mask {
|
||||
const llama_hparams & hparams;
|
||||
const llama_ubatch * ubatch;
|
||||
|
||||
const std::vector<llama_kv_cells> & v_cells;
|
||||
const std::vector<uint32_t> & seq_to_stream;
|
||||
|
||||
uint32_t n_swa;
|
||||
llama_swa_type swa_type;
|
||||
|
||||
int64_t n_kv;
|
||||
int64_t n_stream;
|
||||
int64_t n_tps;
|
||||
};
|
||||
|
||||
template<bool causal, bool swa, bool is_2d, bool alibi>
|
||||
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
||||
//const auto & hparams = args.hparams;
|
||||
const auto & ubatch = args.ubatch;
|
||||
|
||||
const auto & v_cells = args.v_cells;
|
||||
const auto & seq_to_stream = args.seq_to_stream;
|
||||
|
||||
const uint32_t n_swa = args.n_swa;
|
||||
const llama_swa_type swa_type = args.swa_type;
|
||||
|
||||
const int64_t n_kv = args.n_kv;
|
||||
const int64_t n_stream = args.n_stream;
|
||||
const int64_t n_tps = args.n_tps;
|
||||
|
||||
// the min position in the batch for each sequence
|
||||
llama_pos seq_pos_min[LLAMA_MAX_SEQ];
|
||||
std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
|
||||
|
||||
for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
||||
|
||||
seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
|
||||
}
|
||||
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
// bookeeping of the KQ mask cells that could change for other tokens of the same sequence
|
||||
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
|
||||
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
|
||||
|
||||
for (uint32_t ii = 0; ii < n_tps; ++ii) {
|
||||
const uint32_t i = s*n_tps + ii;
|
||||
|
||||
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
||||
|
||||
const auto & cells = v_cells.at(seq_to_stream[seq_id]);
|
||||
|
||||
llama_pos p0 = -1;
|
||||
const llama_pos p1 = ubatch->pos[i];
|
||||
|
||||
// for M-RoPE
|
||||
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
||||
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
||||
|
||||
const uint64_t idst = n_kv*i;
|
||||
|
||||
// for tokens of the same sequence, the mask is mostly the same, so we can reuse it
|
||||
// the only cells that could change are the ones that are with similar positions as the
|
||||
// ones in the batch (i.e. due to causal masking, SWA, etc.)
|
||||
// keep track of those cells and shortcut the loop to save time
|
||||
// note: this optimization is not compatible with Alibi position encoding
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18842
|
||||
bool prev = false;
|
||||
|
||||
auto & idxs = seq_idxs[seq_id];
|
||||
|
||||
if (!alibi) {
|
||||
if (seq_srct.find(seq_id) != seq_srct.end()) {
|
||||
const uint32_t srct = seq_srct[seq_id];
|
||||
|
||||
const uint64_t idst_prev = n_kv*srct;
|
||||
|
||||
std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
|
||||
|
||||
prev = true;
|
||||
} else {
|
||||
idxs.clear();
|
||||
idxs.reserve(ubatch->n_tokens + n_swa + 32);
|
||||
|
||||
seq_srct[seq_id] = i;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t jj = 0; jj < n_kv; ++jj) {
|
||||
uint32_t j = jj;
|
||||
|
||||
// we have an exiting mask for this sequence -> update just seq_idxs
|
||||
if (!alibi) {
|
||||
if (prev) {
|
||||
if (jj >= idxs.size()) {
|
||||
break;
|
||||
}
|
||||
|
||||
j = idxs[jj];
|
||||
}
|
||||
}
|
||||
|
||||
if (cells.is_empty(j)) {
|
||||
goto skip;
|
||||
}
|
||||
|
||||
// mask the token if not the same sequence
|
||||
if (!cells.seq_has(j, seq_id)) {
|
||||
goto skip;
|
||||
}
|
||||
|
||||
p0 = cells.pos_get(j);
|
||||
|
||||
if (!alibi) {
|
||||
if (!prev) {
|
||||
// record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
|
||||
if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
|
||||
idxs.push_back(j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (causal) {
|
||||
// mask future tokens
|
||||
if (p0 > p1) {
|
||||
goto skip;
|
||||
}
|
||||
|
||||
// M-RoPE causal mask
|
||||
if (is_2d) {
|
||||
if (p0 == p1) {
|
||||
const auto & p0_ext = cells.ext_get(j);
|
||||
|
||||
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
|
||||
goto skip;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// apply SWA if any
|
||||
if (swa) {
|
||||
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
|
||||
goto skip;
|
||||
}
|
||||
}
|
||||
|
||||
if (alibi) {
|
||||
data[idst + j] = -std::abs(p0 - p1);
|
||||
} else {
|
||||
data[idst + j] = 0.0f;
|
||||
}
|
||||
|
||||
continue;
|
||||
skip:
|
||||
data[idst + j] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<bool causal, bool swa, bool is_2d>
|
||||
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
||||
const bool alibi = args.hparams.use_alibi;
|
||||
if (alibi) {
|
||||
set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
|
||||
} else {
|
||||
set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
|
||||
}
|
||||
}
|
||||
|
||||
template<bool causal, bool swa>
|
||||
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
||||
const bool is_2d = args.ubatch->is_pos_2d();
|
||||
if (is_2d) {
|
||||
set_input_kq_mask_impl<causal, swa, true> (args, data);
|
||||
} else {
|
||||
set_input_kq_mask_impl<causal, swa, false>(args, data);
|
||||
}
|
||||
}
|
||||
|
||||
template<bool causal>
|
||||
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
||||
const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
|
||||
if (swa) {
|
||||
set_input_kq_mask_impl<causal, true> (args, data);
|
||||
} else {
|
||||
set_input_kq_mask_impl<causal, false>(args, data);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
||||
const uint32_t n_tokens = ubatch->n_tokens;
|
||||
|
||||
@@ -1251,74 +1442,29 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
|
||||
// n_tps == n_tokens_per_stream
|
||||
const int64_t n_tps = n_tokens/n_stream;
|
||||
|
||||
std::fill(data, data + ggml_nelements(dst), -INFINITY);
|
||||
//const int64_t t_start = ggml_time_us();
|
||||
|
||||
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
|
||||
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
||||
// Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
|
||||
// Causal mask:
|
||||
// xxx-------
|
||||
// xxxx------
|
||||
// xxxxx-----
|
||||
// Non-causal mask:
|
||||
// xxxxx-----
|
||||
// xxxxx-----
|
||||
// xxxxx-----
|
||||
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
|
||||
// TODO: optimize this section
|
||||
for (uint32_t h = 0; h < 1; ++h) {
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
for (uint32_t ii = 0; ii < n_tps; ++ii) {
|
||||
const uint32_t i = s*n_tps + ii;
|
||||
const args_set_input_kq_mask args = {
|
||||
/*.hparams =*/ hparams,
|
||||
/*.ubatch =*/ ubatch,
|
||||
/*.v_cells =*/ v_cells,
|
||||
/*.seq_to_stream =*/ seq_to_stream,
|
||||
/*.n_swa =*/ n_swa,
|
||||
/*.swa_type =*/ swa_type,
|
||||
/*.n_kv =*/ n_kv,
|
||||
/*.n_stream =*/ n_stream,
|
||||
/*.n_tps =*/ n_tps,
|
||||
};
|
||||
|
||||
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
||||
|
||||
const auto & cells = v_cells[seq_to_stream[seq_id]];
|
||||
|
||||
const llama_pos p1 = ubatch->pos[i];
|
||||
|
||||
// for M-RoPE
|
||||
const bool is_2d = ubatch->is_pos_2d();
|
||||
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
||||
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
||||
|
||||
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
|
||||
|
||||
for (uint32_t j = 0; j < n_kv; ++j) {
|
||||
if (cells.is_empty(j)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// mask the token if not the same sequence
|
||||
if (!cells.seq_has(j, seq_id)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_pos p0 = cells.pos_get(j);
|
||||
|
||||
// mask future tokens
|
||||
if (causal_attn && p0 > p1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// M-RoPE causal mask
|
||||
if (causal_attn && is_2d && p0 == p1) {
|
||||
const auto & p0_ext = cells.ext_get(j);
|
||||
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// apply SWA if any
|
||||
if (is_masked_swa(p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (causal_attn) {
|
||||
set_input_kq_mask_impl<true> (args, data);
|
||||
} else {
|
||||
set_input_kq_mask_impl<false>(args, data);
|
||||
}
|
||||
|
||||
//const int64_t t_end = ggml_time_us();
|
||||
|
||||
//LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
|
||||
}
|
||||
|
||||
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
||||
@@ -1483,10 +1629,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
||||
return gf;
|
||||
}
|
||||
|
||||
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
||||
return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
|
||||
}
|
||||
|
||||
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
|
||||
@@ -257,8 +257,6 @@ private:
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
bool is_masked_swa(llama_pos p0, llama_pos p1) const;
|
||||
|
||||
ggml_tensor * build_rope_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
|
||||
+5
-1
@@ -265,7 +265,8 @@ struct llama_file::impl {
|
||||
continue; // Interrupted by signal, retry
|
||||
}
|
||||
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
||||
if (errno == EFAULT) {
|
||||
if (errno == EFAULT || errno == EINVAL) {
|
||||
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
|
||||
auto curr_off = tell();
|
||||
close(fd);
|
||||
fd = -1;
|
||||
@@ -384,6 +385,9 @@ int llama_file::file_id() const {
|
||||
#ifdef _WIN32
|
||||
return _fileno(pimpl->fp);
|
||||
#else
|
||||
if (pimpl->fd != -1) {
|
||||
return pimpl->fd;
|
||||
}
|
||||
#if defined(fileno)
|
||||
return fileno(pimpl->fp);
|
||||
#else
|
||||
|
||||
@@ -539,12 +539,18 @@ llama_model_loader::llama_model_loader(
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
||||
|
||||
// Disable mmap in case Direct I/O is enabled and available
|
||||
if (use_direct_io && use_mmap) {
|
||||
use_mmap = false;
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
if (use_mmap && use_direct_io) {
|
||||
if (files.back()->has_direct_io()) {
|
||||
// Disable mmap, as DirectIO is available
|
||||
use_mmap = false;
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
} else {
|
||||
// Disable DirectIO and reopen file using std::fopen for mmap
|
||||
use_direct_io = false;
|
||||
files.pop_back();
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
|
||||
@@ -187,6 +187,7 @@ llama_build_and_test(test-chat-parser.cpp)
|
||||
llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
|
||||
llama_build_and_test(test-chat-template.cpp)
|
||||
llama_build_and_test(test-jinja.cpp)
|
||||
llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
|
||||
llama_build_and_test(test-json-partial.cpp)
|
||||
llama_build_and_test(test-log.cpp)
|
||||
llama_build_and_test(
|
||||
|
||||
+345
-9
@@ -4,6 +4,7 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <sheredom/subprocess.h>
|
||||
|
||||
#include "jinja/runtime.h"
|
||||
#include "jinja/parser.h"
|
||||
@@ -31,12 +32,24 @@ static void test_array_methods(testing & t);
|
||||
static void test_object_methods(testing & t);
|
||||
static void test_fuzzing(testing & t);
|
||||
|
||||
static bool g_python_mode = false;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
testing t(std::cout);
|
||||
t.verbose = true;
|
||||
|
||||
if (argc >= 2) {
|
||||
t.set_filter(argv[1]);
|
||||
// usage: test-jinja [-py] [filter_regex]
|
||||
// -py : enable python mode (use python jinja2 for rendering expected output)
|
||||
// only use this for cross-checking, not for correctness
|
||||
// note: the implementation of this flag is basic, only intented to be used by maintainers
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
if (arg == "-py") {
|
||||
g_python_mode = true;
|
||||
} else {
|
||||
t.set_filter(arg);
|
||||
}
|
||||
}
|
||||
|
||||
t.test("whitespace control", test_whitespace_control);
|
||||
@@ -53,7 +66,9 @@ int main(int argc, char *argv[]) {
|
||||
t.test("string methods", test_string_methods);
|
||||
t.test("array methods", test_array_methods);
|
||||
t.test("object methods", test_object_methods);
|
||||
t.test("fuzzing", test_fuzzing);
|
||||
if (!g_python_mode) {
|
||||
t.test("fuzzing", test_fuzzing);
|
||||
}
|
||||
|
||||
return t.summary();
|
||||
}
|
||||
@@ -176,6 +191,84 @@ static void test_conditionals(testing & t) {
|
||||
json::object(),
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is undefined falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
json::object(),
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is undefined attribute falsy",
|
||||
"{{ 'yes' if not y.x else 'no' }}",
|
||||
{{"y", true}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is undefined key falsy",
|
||||
"{{ 'yes' if not y['x'] else 'no' }}",
|
||||
{{"y", {{}}}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is empty array falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", json::array()}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is empty object falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", json::object()}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is empty string falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", ""}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 0 falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", 0}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 0.0 falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", 0.0}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is non-empty array truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", json::array({""})}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is non-empty object truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", {"x", false}}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is non-empty string truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", "0"}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 1 truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", 1}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 1.0 truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", 1.0}},
|
||||
"yes"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_loops(testing & t) {
|
||||
@@ -247,6 +340,12 @@ static void test_expressions(testing & t) {
|
||||
"Bob"
|
||||
);
|
||||
|
||||
test_template(t, "negative float (not dot notation)",
|
||||
"{{ -1.0 }}",
|
||||
json::object(),
|
||||
"-1.0"
|
||||
);
|
||||
|
||||
test_template(t, "bracket notation",
|
||||
"{{ user['name'] }}",
|
||||
{{"user", {{"name", "Bob"}}}},
|
||||
@@ -383,6 +482,32 @@ static void test_filters(testing & t) {
|
||||
"123"
|
||||
);
|
||||
|
||||
test_template(t, "sort reverse",
|
||||
"{% for i in items|sort(true) %}{{ i }}{% endfor %}",
|
||||
{{"items", json::array({3, 1, 2})}},
|
||||
"321"
|
||||
);
|
||||
|
||||
test_template(t, "sort with attribute",
|
||||
"{{ items|sort(attribute='name')|join(attribute='age') }}",
|
||||
{{"items", json::array({
|
||||
json({{"name", "c"}, {"age", 3}}),
|
||||
json({{"name", "a"}, {"age", 1}}),
|
||||
json({{"name", "b"}, {"age", 2}}),
|
||||
})}},
|
||||
"123"
|
||||
);
|
||||
|
||||
test_template(t, "sort with numeric attribute",
|
||||
"{{ items|sort(attribute=0)|join(attribute=1) }}",
|
||||
{{"items", json::array({
|
||||
json::array({3, "z"}),
|
||||
json::array({1, "x"}),
|
||||
json::array({2, "y"}),
|
||||
})}},
|
||||
"xyz"
|
||||
);
|
||||
|
||||
test_template(t, "join",
|
||||
"{{ items|join(', ') }}",
|
||||
{{"items", json::array({"a", "b", "c"})}},
|
||||
@@ -534,6 +659,66 @@ static void test_literals(testing & t) {
|
||||
json::object(),
|
||||
"1"
|
||||
);
|
||||
|
||||
test_template(t, "integer|abs",
|
||||
"{{ -42 | abs }}",
|
||||
json::object(),
|
||||
"42"
|
||||
);
|
||||
|
||||
test_template(t, "integer|float",
|
||||
"{{ 42 | float }}",
|
||||
json::object(),
|
||||
"42.0"
|
||||
);
|
||||
|
||||
test_template(t, "integer|tojson",
|
||||
"{{ 42 | tojson }}",
|
||||
json::object(),
|
||||
"42"
|
||||
);
|
||||
|
||||
test_template(t, "float|abs",
|
||||
"{{ -3.14 | abs }}",
|
||||
json::object(),
|
||||
"3.14"
|
||||
);
|
||||
|
||||
test_template(t, "float|int",
|
||||
"{{ 3.14 | int }}",
|
||||
json::object(),
|
||||
"3"
|
||||
);
|
||||
|
||||
test_template(t, "float|tojson",
|
||||
"{{ 3.14 | tojson }}",
|
||||
json::object(),
|
||||
"3.14"
|
||||
);
|
||||
|
||||
test_template(t, "string|tojson",
|
||||
"{{ 'hello' | tojson }}",
|
||||
json::object(),
|
||||
"\"hello\""
|
||||
);
|
||||
|
||||
test_template(t, "boolean|int",
|
||||
"{{ true | int }}",
|
||||
json::object(),
|
||||
"1"
|
||||
);
|
||||
|
||||
test_template(t, "boolean|float",
|
||||
"{{ true | float }}",
|
||||
json::object(),
|
||||
"1.0"
|
||||
);
|
||||
|
||||
test_template(t, "boolean|tojson",
|
||||
"{{ true | tojson }}",
|
||||
json::object(),
|
||||
"true"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_comments(testing & t) {
|
||||
@@ -934,7 +1119,17 @@ static void test_array_methods(testing & t) {
|
||||
);
|
||||
|
||||
test_template(t, "array|join attribute",
|
||||
"{{ arr|join(attribute=0) }}",
|
||||
"{{ arr|join(attribute='age') }}",
|
||||
{{"arr", json::array({
|
||||
json({{"name", "a"}, {"age", 1}}),
|
||||
json({{"name", "b"}, {"age", 2}}),
|
||||
json({{"name", "c"}, {"age", 3}}),
|
||||
})}},
|
||||
"123"
|
||||
);
|
||||
|
||||
test_template(t, "array|join numeric attribute",
|
||||
"{{ arr|join(attribute=-1) }}",
|
||||
{{"arr", json::array({json::array({1}), json::array({2}), json::array({3})})}},
|
||||
"123"
|
||||
);
|
||||
@@ -957,8 +1152,8 @@ static void test_array_methods(testing & t) {
|
||||
"a,b,c,d"
|
||||
);
|
||||
|
||||
test_template(t, "array.map() with attribute",
|
||||
"{% for v in arr.map('age') %}{{ v }} {% endfor %}",
|
||||
test_template(t, "array|map with attribute",
|
||||
"{% for v in arr|map(attribute='age') %}{{ v }} {% endfor %}",
|
||||
{{"arr", json::array({
|
||||
json({{"name", "a"}, {"age", 1}}),
|
||||
json({{"name", "b"}, {"age", 2}}),
|
||||
@@ -967,8 +1162,28 @@ static void test_array_methods(testing & t) {
|
||||
"1 2 3 "
|
||||
);
|
||||
|
||||
test_template(t, "array.map() with numeric attribute",
|
||||
"{% for v in arr.map(0) %}{{ v }} {% endfor %}",
|
||||
test_template(t, "array|map with attribute default",
|
||||
"{% for v in arr|map(attribute='age', default=3) %}{{ v }} {% endfor %}",
|
||||
{{"arr", json::array({
|
||||
json({{"name", "a"}, {"age", 1}}),
|
||||
json({{"name", "b"}, {"age", 2}}),
|
||||
json({{"name", "c"}}),
|
||||
})}},
|
||||
"1 2 3 "
|
||||
);
|
||||
|
||||
test_template(t, "array|map without attribute default",
|
||||
"{% for v in arr|map(attribute='age') %}{{ v }} {% endfor %}",
|
||||
{{"arr", json::array({
|
||||
json({{"name", "a"}, {"age", 1}}),
|
||||
json({{"name", "b"}, {"age", 2}}),
|
||||
json({{"name", "c"}}),
|
||||
})}},
|
||||
"1 2 "
|
||||
);
|
||||
|
||||
test_template(t, "array|map with numeric attribute",
|
||||
"{% for v in arr|map(attribute=0) %}{{ v }} {% endfor %}",
|
||||
{{"arr", json::array({
|
||||
json::array({10, "x"}),
|
||||
json::array({20, "y"}),
|
||||
@@ -977,6 +1192,22 @@ static void test_array_methods(testing & t) {
|
||||
"10 20 30 "
|
||||
);
|
||||
|
||||
test_template(t, "array|map with negative attribute",
|
||||
"{% for v in arr|map(attribute=-1) %}{{ v }} {% endfor %}",
|
||||
{{"arr", json::array({
|
||||
json::array({10, "x"}),
|
||||
json::array({20, "y"}),
|
||||
json::array({30, "z"}),
|
||||
})}},
|
||||
"x y z "
|
||||
);
|
||||
|
||||
test_template(t, "array|map with filter",
|
||||
"{{ arr|map('int')|sum }}",
|
||||
{{"arr", json::array({"1", "2", "3"})}},
|
||||
"6"
|
||||
);
|
||||
|
||||
// not used by any chat templates
|
||||
// test_template(t, "array.insert()",
|
||||
// "{% set _ = arr.insert(1, 'x') %}{{ arr|join(',') }}",
|
||||
@@ -1063,9 +1294,21 @@ static void test_object_methods(testing & t) {
|
||||
{{"obj", {{"items", json::array({1, 2, 3})}}}},
|
||||
"{\"items\": [1, 2, 3]}"
|
||||
);
|
||||
|
||||
test_template(t, "object attribute and key access",
|
||||
"{{ obj.keys()|join(',') }} vs {{ obj['keys'] }} vs {{ obj.test }}",
|
||||
{{"obj", {{"keys", "value"}, {"test", "attr_value"}}}},
|
||||
"keys,test vs value vs attr_value"
|
||||
);
|
||||
|
||||
test_template(t, "env should not have object methods",
|
||||
"{{ keys is undefined }} {{ obj.keys is defined }}",
|
||||
{{"obj", {{"a", "b"}}}},
|
||||
"True True"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_template(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
|
||||
static void test_template_cpp(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
|
||||
t.test(name, [&tmpl, &vars, &expect](testing & t) {
|
||||
jinja::lexer lexer;
|
||||
auto lexer_res = lexer.tokenize(tmpl);
|
||||
@@ -1098,6 +1341,99 @@ static void test_template(testing & t, const std::string & name, const std::stri
|
||||
});
|
||||
}
|
||||
|
||||
// keep this in-sync with https://github.com/huggingface/transformers/blob/main/src/transformers/utils/chat_template_utils.py
|
||||
// note: we use SandboxedEnvironment instead of ImmutableSandboxedEnvironment to allow usage of in-place array methods like append() and pop()
|
||||
static std::string py_script = R"(
|
||||
import jinja2
|
||||
import jinja2.ext as jinja2_ext
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from jinja2.sandbox import SandboxedEnvironment
|
||||
|
||||
tmpl = json.loads(sys.argv[1])
|
||||
vars_json = json.loads(sys.argv[2])
|
||||
|
||||
env = SandboxedEnvironment(
|
||||
trim_blocks=True,
|
||||
lstrip_blocks=True,
|
||||
extensions=[jinja2_ext.loopcontrols],
|
||||
)
|
||||
|
||||
def raise_exception(message):
|
||||
raise jinja2.exceptions.TemplateError(message)
|
||||
|
||||
env.filters["tojson"] = lambda x, ensure_ascii=False, indent=None, separators=None, sort_keys=False: json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys)
|
||||
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)
|
||||
env.globals["raise_exception"] = raise_exception
|
||||
|
||||
template = env.from_string(tmpl)
|
||||
result = template.render(**vars_json)
|
||||
print(result, end='')
|
||||
)";
|
||||
|
||||
static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
|
||||
t.test(name, [&tmpl, &vars, &expect](testing & t) {
|
||||
// Prepare arguments
|
||||
std::string tmpl_json = json(tmpl).dump();
|
||||
std::string vars_json = vars.dump();
|
||||
|
||||
#ifdef _WIN32
|
||||
const char * python_executable = "python.exe";
|
||||
#else
|
||||
const char * python_executable = "python3";
|
||||
#endif
|
||||
|
||||
const char * command_line[] = {python_executable, "-c", py_script.c_str(), tmpl_json.c_str(), vars_json.c_str(), NULL};
|
||||
|
||||
struct subprocess_s subprocess;
|
||||
int options = subprocess_option_combined_stdout_stderr
|
||||
| subprocess_option_no_window
|
||||
| subprocess_option_inherit_environment
|
||||
| subprocess_option_search_user_path;
|
||||
int result = subprocess_create(command_line, options, &subprocess);
|
||||
|
||||
if (result != 0) {
|
||||
t.log("Failed to create subprocess, error code: " + std::to_string(result));
|
||||
t.assert_true("subprocess creation", false);
|
||||
return;
|
||||
}
|
||||
|
||||
// Read output
|
||||
std::string output;
|
||||
char buffer[1024];
|
||||
FILE * p_stdout = subprocess_stdout(&subprocess);
|
||||
while (fgets(buffer, sizeof(buffer), p_stdout)) {
|
||||
output += buffer;
|
||||
}
|
||||
|
||||
int process_return;
|
||||
subprocess_join(&subprocess, &process_return);
|
||||
subprocess_destroy(&subprocess);
|
||||
|
||||
if (process_return != 0) {
|
||||
t.log("Python script failed with exit code: " + std::to_string(process_return));
|
||||
t.log("Output: " + output);
|
||||
t.assert_true("python execution", false);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!t.assert_true("Template render mismatch", expect == output)) {
|
||||
t.log("Template: " + json(tmpl).dump());
|
||||
t.log("Expected: " + json(expect).dump());
|
||||
t.log("Python : " + json(output).dump());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static void test_template(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
|
||||
if (g_python_mode) {
|
||||
test_template_py(t, name, tmpl, vars, expect);
|
||||
} else {
|
||||
test_template_cpp(t, name, tmpl, vars, expect);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// fuzz tests to ensure no crashes occur on malformed inputs
|
||||
//
|
||||
|
||||
+27
-5
@@ -71,14 +71,16 @@ struct cli_context {
|
||||
|
||||
std::string generate_completion(result_timings & out_timings) {
|
||||
server_response_reader rd = ctx_server.get_response_reader();
|
||||
auto formatted = format_chat();
|
||||
{
|
||||
// TODO: reduce some copies here in the future
|
||||
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
|
||||
task.id = rd.get_new_id();
|
||||
task.index = 0;
|
||||
task.params = defaults; // copy
|
||||
task.cli_input = messages; // copy
|
||||
task.cli_files = input_files; // copy
|
||||
task.id = rd.get_new_id();
|
||||
task.index = 0;
|
||||
task.params = defaults; // copy
|
||||
task.cli_prompt = formatted.prompt; // copy
|
||||
task.cli_files = input_files; // copy
|
||||
task.cli = true;
|
||||
rd.post_task({std::move(task)});
|
||||
}
|
||||
|
||||
@@ -156,6 +158,26 @@ struct cli_context {
|
||||
return content;
|
||||
}
|
||||
}
|
||||
|
||||
common_chat_params format_chat() {
|
||||
auto meta = ctx_server.get_meta();
|
||||
auto & chat_params = meta.chat_params;
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
|
||||
inputs.tools = {}; // TODO
|
||||
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
inputs.json_schema = ""; // TODO
|
||||
inputs.grammar = ""; // TODO
|
||||
inputs.use_jinja = chat_params.use_jinja;
|
||||
inputs.parallel_tool_calls = false;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = chat_params.reasoning_format;
|
||||
inputs.enable_thinking = chat_params.enable_thinking;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
@@ -831,7 +831,7 @@ static void handle_media(
|
||||
// used by /chat/completions endpoint
|
||||
json oaicompat_chat_params_parse(
|
||||
json & body, /* openai api json semantics */
|
||||
const oaicompat_parser_options & opt,
|
||||
const server_chat_params & opt,
|
||||
std::vector<raw_buffer> & out_files)
|
||||
{
|
||||
json llama_params;
|
||||
@@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse(
|
||||
}
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
|
||||
|
||||
/* Append assistant prefilled message */
|
||||
if (prefill_assistant_message) {
|
||||
|
||||
@@ -274,25 +274,25 @@ std::vector<server_tokens> tokenize_input_prompts(
|
||||
// OAI utils
|
||||
//
|
||||
|
||||
// used by /completions endpoint
|
||||
json oaicompat_completion_params_parse(const json & body);
|
||||
|
||||
struct oaicompat_parser_options {
|
||||
struct server_chat_params {
|
||||
bool use_jinja;
|
||||
bool prefill_assistant;
|
||||
common_reasoning_format reasoning_format;
|
||||
std::map<std::string,std::string> chat_template_kwargs;
|
||||
common_chat_templates * tmpls;
|
||||
std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value
|
||||
common_chat_templates_ptr tmpls;
|
||||
bool allow_image;
|
||||
bool allow_audio;
|
||||
bool enable_thinking = true;
|
||||
std::string media_path;
|
||||
};
|
||||
|
||||
// used by /completions endpoint
|
||||
json oaicompat_completion_params_parse(const json & body);
|
||||
|
||||
// used by /chat/completions endpoint
|
||||
json oaicompat_chat_params_parse(
|
||||
json & body, /* openai api json semantics */
|
||||
const oaicompat_parser_options & opt,
|
||||
const server_chat_params & opt,
|
||||
std::vector<raw_buffer> & out_files);
|
||||
|
||||
// convert Anthropic Messages API format to OpenAI Chat Completions API format
|
||||
|
||||
@@ -534,8 +534,8 @@ public:
|
||||
server_queue queue_tasks;
|
||||
server_response queue_results;
|
||||
|
||||
common_chat_templates_ptr chat_templates;
|
||||
oaicompat_parser_options oai_parser_opt;
|
||||
// note: chat_params must not be refreshed upon existing sleeping state
|
||||
server_chat_params chat_params;
|
||||
|
||||
~server_context_impl() {
|
||||
if (!sleeping) {
|
||||
@@ -688,15 +688,6 @@ private:
|
||||
llama_init_dft->free_context();
|
||||
}
|
||||
|
||||
chat_templates = common_chat_templates_init(model, params_base.chat_template);
|
||||
try {
|
||||
common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
|
||||
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
|
||||
chat_templates = common_chat_templates_init(model, "chatml");
|
||||
}
|
||||
|
||||
std::string & mmproj_path = params_base.mmproj.path;
|
||||
if (!mmproj_path.empty()) {
|
||||
if (!is_resume) {
|
||||
@@ -845,30 +836,6 @@ private:
|
||||
model_name = model_path.filename().string();
|
||||
}
|
||||
|
||||
// thinking is enabled if:
|
||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||
// 2. The chat template supports it
|
||||
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
||||
SRV_INF("thinking = %d\n", enable_thinking);
|
||||
|
||||
oai_parser_opt = {
|
||||
/* use_jinja */ params_base.use_jinja,
|
||||
/* prefill_assistant */ params_base.prefill_assistant,
|
||||
/* reasoning_format */ params_base.reasoning_format,
|
||||
/* chat_template_kwargs */ params_base.default_template_kwargs,
|
||||
/* common_chat_templates */ chat_templates.get(),
|
||||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
/* media_path */ params_base.media_path,
|
||||
};
|
||||
|
||||
// print sample chat example to make it clear which template is used
|
||||
// @ngxson modern templates are too long, spam the logs; printing the example is enough
|
||||
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
|
||||
// common_chat_templates_source(chat_templates.get()),
|
||||
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
|
||||
|
||||
if (!is_resume) {
|
||||
return init();
|
||||
}
|
||||
@@ -907,6 +874,42 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
// populate chat template params
|
||||
{
|
||||
common_chat_templates_ptr chat_templates;
|
||||
|
||||
try {
|
||||
chat_templates = common_chat_templates_init(model, params_base.chat_template);
|
||||
|
||||
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
|
||||
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
|
||||
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
|
||||
SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
|
||||
SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
// thinking is enabled if:
|
||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||
// 2. The chat template supports it
|
||||
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
||||
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
|
||||
|
||||
chat_params = {
|
||||
/* use_jinja */ params_base.use_jinja,
|
||||
/* prefill_assistant */ params_base.prefill_assistant,
|
||||
/* reasoning_format */ params_base.reasoning_format,
|
||||
/* chat_template_kwargs */ params_base.default_template_kwargs,
|
||||
/* tmpls */ std::move(chat_templates),
|
||||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
/* media_path */ params_base.media_path,
|
||||
};
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1326,11 +1329,12 @@ private:
|
||||
}
|
||||
|
||||
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
|
||||
const size_t n_probs = slot.task->params.sampling.n_probs;
|
||||
const size_t n_probs_request = slot.task->params.sampling.n_probs;
|
||||
|
||||
if (post_sampling) {
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true);
|
||||
const size_t max_probs = cur_p->size;
|
||||
const size_t n_probs = std::min(max_probs, n_probs_request);
|
||||
|
||||
// set probability for sampled token
|
||||
for (size_t i = 0; i < max_probs; i++) {
|
||||
@@ -1341,8 +1345,8 @@ private:
|
||||
}
|
||||
|
||||
// set probability for top n_probs tokens
|
||||
result.probs.reserve(max_probs);
|
||||
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
|
||||
result.probs.reserve(n_probs);
|
||||
for (size_t i = 0; i < n_probs; i++) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
common_token_to_piece(ctx, cur_p->data[i].id, special),
|
||||
@@ -1352,9 +1356,11 @@ private:
|
||||
} else {
|
||||
// TODO: optimize this with min-p optimization
|
||||
std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
|
||||
const size_t max_probs = cur.size();
|
||||
const size_t n_probs = std::min(max_probs, n_probs_request);
|
||||
|
||||
// set probability for sampled token
|
||||
for (size_t i = 0; i < cur.size(); i++) {
|
||||
for (size_t i = 0; i < max_probs; i++) {
|
||||
// set probability for sampled token
|
||||
if (cur[i].id == result.tok) {
|
||||
result.prob = cur[i].p;
|
||||
@@ -1364,7 +1370,7 @@ private:
|
||||
|
||||
// set probability for top n_probs tokens
|
||||
result.probs.reserve(n_probs);
|
||||
for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) {
|
||||
for (size_t i = 0; i < n_probs; i++) {
|
||||
result.probs.push_back({
|
||||
cur[i].id,
|
||||
common_token_to_piece(ctx, cur[i].id, special),
|
||||
@@ -1585,32 +1591,14 @@ private:
|
||||
|
||||
// tokenize the input if it's set by CLI, return false on error
|
||||
bool tokenize_cli_input(server_task & task) {
|
||||
GGML_ASSERT(task.cli_input != nullptr);
|
||||
try {
|
||||
auto & opt = oai_parser_opt;
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
|
||||
inputs.tools = {}; // TODO
|
||||
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
inputs.json_schema = ""; // TODO
|
||||
inputs.grammar = ""; // TODO
|
||||
inputs.use_jinja = opt.use_jinja;
|
||||
inputs.parallel_tool_calls = false;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
inputs.enable_thinking = opt.enable_thinking;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
|
||||
|
||||
// tokenize the resulting prompt
|
||||
auto & prompt = chat_params.prompt;
|
||||
auto & prompt = task.cli_prompt;
|
||||
if (mctx != nullptr) {
|
||||
task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
|
||||
} else {
|
||||
task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
|
||||
}
|
||||
task.cli_input.clear();
|
||||
task.cli_prompt.clear();
|
||||
task.cli_files.clear();
|
||||
} catch (const std::exception & e) {
|
||||
send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -1686,7 +1674,7 @@ private:
|
||||
{
|
||||
// special case: if input is provided via CLI, tokenize it first
|
||||
// otherwise, no need to tokenize as it's already done inside the HTTP thread
|
||||
if (task.cli_input != nullptr) {
|
||||
if (task.cli) {
|
||||
if (!tokenize_cli_input(task)) {
|
||||
break;
|
||||
}
|
||||
@@ -2898,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
|
||||
}
|
||||
|
||||
server_context_meta server_context::get_meta() const {
|
||||
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
|
||||
|
||||
auto bos_id = llama_vocab_bos(impl->vocab);
|
||||
auto eos_id = llama_vocab_eos(impl->vocab);
|
||||
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
|
||||
@@ -2910,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
|
||||
/* model_name */ impl->model_name,
|
||||
/* model_path */ impl->params_base.model.path,
|
||||
/* has_mtmd */ impl->mctx != nullptr,
|
||||
/* has_inp_image */ impl->oai_parser_opt.allow_image,
|
||||
/* has_inp_audio */ impl->oai_parser_opt.allow_audio,
|
||||
/* has_inp_image */ impl->chat_params.allow_image,
|
||||
/* has_inp_audio */ impl->chat_params.allow_audio,
|
||||
/* json_webui_settings */ impl->json_webui_settings,
|
||||
/* slot_n_ctx */ impl->get_slot_n_ctx(),
|
||||
/* pooling_type */ llama_pooling_type(impl->ctx),
|
||||
|
||||
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
|
||||
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
|
||||
/* chat_params */ impl->chat_params,
|
||||
|
||||
/* bos_token_str */ bos_token_str,
|
||||
/* eos_token_str */ eos_token_str,
|
||||
@@ -3199,8 +3184,8 @@ void server_routes::init_routes() {
|
||||
|
||||
// this endpoint can be accessed during sleeping
|
||||
// the next LOC is to avoid someone accidentally use ctx_server
|
||||
bool server_ctx; // do NOT delete this line
|
||||
GGML_UNUSED(server_ctx);
|
||||
bool ctx_server; // do NOT delete this line
|
||||
GGML_UNUSED(ctx_server);
|
||||
|
||||
res->ok({{"status", "ok"}});
|
||||
return res;
|
||||
@@ -3390,8 +3375,8 @@ void server_routes::init_routes() {
|
||||
|
||||
// this endpoint can be accessed during sleeping
|
||||
// the next LOC is to avoid someone accidentally use ctx_server
|
||||
bool server_ctx; // do NOT delete this line
|
||||
GGML_UNUSED(server_ctx);
|
||||
bool ctx_server; // do NOT delete this line
|
||||
GGML_UNUSED(ctx_server);
|
||||
|
||||
task_params tparams;
|
||||
tparams.sampling = params.sampling;
|
||||
@@ -3400,6 +3385,9 @@ void server_routes::init_routes() {
|
||||
{ "n_ctx", meta->slot_n_ctx },
|
||||
};
|
||||
|
||||
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
|
||||
std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
|
||||
|
||||
json props = {
|
||||
{ "default_generation_settings", default_generation_settings_for_props },
|
||||
{ "total_slots", params.n_parallel },
|
||||
@@ -3414,15 +3402,15 @@ void server_routes::init_routes() {
|
||||
{ "endpoint_metrics", params.endpoint_metrics },
|
||||
{ "webui", params.webui },
|
||||
{ "webui_settings", meta->json_webui_settings },
|
||||
{ "chat_template", meta->chat_template },
|
||||
{ "chat_template", tmpl_default },
|
||||
{ "bos_token", meta->bos_token_str },
|
||||
{ "eos_token", meta->eos_token_str },
|
||||
{ "build_info", meta->build_info },
|
||||
{ "is_sleeping", queue_tasks.is_sleeping() },
|
||||
};
|
||||
if (params.use_jinja) {
|
||||
if (!meta->chat_template_tool_use.empty()) {
|
||||
props["chat_template_tool_use"] = meta->chat_template_tool_use;
|
||||
if (!tmpl_tools.empty()) {
|
||||
props["chat_template_tool_use"] = tmpl_tools;
|
||||
}
|
||||
}
|
||||
res->ok(props);
|
||||
@@ -3443,6 +3431,7 @@ void server_routes::init_routes() {
|
||||
|
||||
this->get_api_show = [this](const server_http_req &) {
|
||||
auto res = create_response();
|
||||
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
|
||||
json data = {
|
||||
{
|
||||
"model_info", {
|
||||
@@ -3451,7 +3440,7 @@ void server_routes::init_routes() {
|
||||
},
|
||||
{"modelfile", ""},
|
||||
{"parameters", ""},
|
||||
{"template", meta->chat_template},
|
||||
{"template", tmpl_default},
|
||||
{"details", {
|
||||
{"parent_model", ""},
|
||||
{"format", "gguf"},
|
||||
@@ -3576,7 +3565,7 @@ void server_routes::init_routes() {
|
||||
json body = json::parse(req.body);
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
return handle_completions_impl(
|
||||
req,
|
||||
@@ -3592,7 +3581,7 @@ void server_routes::init_routes() {
|
||||
json body = convert_anthropic_to_oai(json::parse(req.body));
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
return handle_completions_impl(
|
||||
req,
|
||||
@@ -3608,7 +3597,7 @@ void server_routes::init_routes() {
|
||||
json body = convert_anthropic_to_oai(json::parse(req.body));
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
|
||||
json prompt = body_parsed.at("prompt");
|
||||
@@ -3624,7 +3613,7 @@ void server_routes::init_routes() {
|
||||
json body = json::parse(req.body);
|
||||
json data = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
res->ok({{ "prompt", std::move(data.at("prompt")) }});
|
||||
return res;
|
||||
@@ -3635,8 +3624,8 @@ void server_routes::init_routes() {
|
||||
|
||||
// this endpoint can be accessed during sleeping
|
||||
// the next LOC is to avoid someone accidentally use ctx_server
|
||||
bool server_ctx; // do NOT delete this line
|
||||
GGML_UNUSED(server_ctx);
|
||||
bool ctx_server; // do NOT delete this line
|
||||
GGML_UNUSED(ctx_server);
|
||||
|
||||
json models = {
|
||||
{"models", {
|
||||
|
||||
@@ -20,9 +20,8 @@ struct server_context_meta {
|
||||
int slot_n_ctx;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
// chat template
|
||||
std::string chat_template;
|
||||
std::string chat_template_tool_use;
|
||||
// chat params
|
||||
server_chat_params & chat_params;
|
||||
|
||||
// tokens
|
||||
std::string bos_token_str;
|
||||
|
||||
@@ -130,8 +130,10 @@ struct server_task {
|
||||
task_params params;
|
||||
server_tokens tokens;
|
||||
|
||||
// only used by CLI, this delegates the tokenization to the server
|
||||
json cli_input = nullptr;
|
||||
// only used by CLI, this allow tokenizing CLI inputs on server side
|
||||
// we need this because mtmd_context and vocab are not accessible outside of server_context
|
||||
bool cli = false;
|
||||
std::string cli_prompt;
|
||||
std::vector<raw_buffer> cli_files;
|
||||
|
||||
server_task_type type;
|
||||
|
||||
Reference in New Issue
Block a user