Compare commits

...

10 Commits

Author SHA1 Message Date
Georgi Gerganov 4f81b33e32 llama : validate seq id batch input (#13809)
* llama : validate seq id batch input

ggml-ci

* cont : fix the fix

ggml-ci
2025-05-27 09:40:59 +03:00
Olivier Chafik cdf94a1802 server: --offline mode (#13804)
* server: --offline mode (env: LLAMA_OFFLINE)

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2025-05-26 22:34:27 +01:00
Georgi Gerganov a26c4cc11e scripts : add option to compare commits in Debug (#13806)
* scripts : add option to compare commits in Debug

* cont : reuse existing CMAKE_OPTS
2025-05-26 22:24:01 +03:00
Georgi Gerganov 4265a87b59 cuda : avoid cuGetErrorString (#13791)
ggml-ci
2025-05-26 22:14:52 +03:00
Akarshan Biswas 6f180b915c SYCL: Add non contiguous support in RMS_NORM and NORM kernels (#13611)
* SYCL: Add non contiguous input support to norm kernel

* refactor and add RMS_NORM non contiguous input support

ggml-ci

* restore subgroup reduction for multi-subgroup thread blocks in norm kernels

* Swap grid dims of nsamples and nrows

ggml-ci

* Revert "Swap grid dims of nsamples and nrows"

This reverts commit 43be2d657fec7f7fba54e2cd154106bc0fc45adf.

* restore not required changes
ggml-ci

* address review comments: change it to more like SYCL

* Use a common function to calculate offset

* remove wrap around logic for handling broadcasts

* remove static from calculate_offset fn and use ceil_div
2025-05-26 21:10:36 +05:30
Olivier Chafik 03f582ae8f server: fix streaming crashes (#13786)
* add preludes to content on partial regex match

* allow all parsers to parse non-tool-call content.

* tweak order of <|python_tag|> vs <function= parsing for functionary v3.1 format. still not ideal but hopefully less prone to crash
2025-05-26 16:03:57 +01:00
standby24x7 88c125f2ac examples/training: Fix file name in README (#13803)
This patch fixes binary file names in README.md.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
2025-05-26 16:55:24 +02:00
Olivier Chafik d74e94c1b3 server: fix format of streamed tool call deltas (diff name, fix id location) (#13800)
* fix deltas of tool_call.function.name

* fix tool_call.id (was in tool_call.function.id!) + add function type

* add tool_call.type

* populate empty tool_call.function.arguments on first delta
2025-05-26 14:56:49 +01:00
Olivier Chafik f13847cfb5 server: fix regression on streamed non-chat completion w/ stops (#13785)
* more forgiving message diffs: partial stop words aren't erased, full stops are

* Add (slow) server test for completion + stream + stop
2025-05-26 14:16:37 +01:00
Georgi Gerganov 79c137f776 examples : allow extracting embeddings from decoder contexts (#13797)
ggml-ci
2025-05-26 14:03:54 +03:00
20 changed files with 422 additions and 271 deletions
+126 -106
View File
@@ -242,7 +242,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
}
// download one single file from remote URL to local path
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
std::string etag;
std::string last_modified;
if (file_exists) {
if (offline) {
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
return true; // skip verification/downloading
}
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
std::ifstream metadata_in(metadata_path);
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
}
}
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
} else {
if (offline) {
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
return false;
}
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
};
common_load_model_from_url_headers headers;
bool head_request_ok = false;
bool should_download = !file_exists; // by default, we should download if the file does not exist
// Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
@@ -269,91 +318,47 @@ static bool common_download_file_single(const std::string & url, const std::stri
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
std::string etag;
std::string last_modified;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
if (file_exists) {
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
std::ifstream metadata_in(metadata_path);
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
}
}
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
return n_items;
};
common_load_model_from_url_headers headers;
bool head_request_ok = false;
bool should_download = !file_exists; // by default, we should download if the file does not exist
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
// get ETag to see if the remote file has changed
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
// we only allow retrying once for HEAD requests
// this is for the use case of using running offline (no internet), retrying can be annoying
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
if (!was_perform_successful) {
head_request_ok = false;
}
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
}
}
return n_items;
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
// we only allow retrying once for HEAD requests
// this is for the use case of using running offline (no internet), retrying can be annoying
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
if (!was_perform_successful) {
head_request_ok = false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code == 200) {
head_request_ok = true;
} else {
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
head_request_ok = false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code == 200) {
head_request_ok = true;
} else {
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
head_request_ok = false;
}
// if head_request_ok is false, we don't have the etag or last-modified headers
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (auto const & item : urls) {
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
return common_download_file_single(it.first, it.second, bearer_token);
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
return common_download_file_single(it.first, it.second, bearer_token, offline);
}, item));
}
@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
static bool common_download_model(
const common_params_model & model,
const std::string & bearer_token) {
const std::string & bearer_token,
bool offline) {
// Basic validation of the model.url
if (model.url.empty()) {
LOG_ERR("%s: invalid model url\n", __func__);
return false;
}
if (!common_download_file_single(model.url, model.path, bearer_token)) {
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
return false;
}
@@ -547,7 +553,7 @@ static bool common_download_model(
}
// Download in parallel
common_download_file_multiple(urls, bearer_token);
common_download_file_multiple(urls, bearer_token, offline);
}
return true;
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
long res_code = 0;
std::string res_str;
bool use_cache = false;
try {
auto res = common_remote_get_content(url, params);
res_code = res.first;
res_str = std::string(res.second.data(), res.second.size());
} catch (const std::exception & e) {
LOG_WRN("error: failed to get manifest: %s\n", e.what());
LOG_WRN("try reading from cache\n");
// try to read from cache
if (!offline) {
try {
auto res = common_remote_get_content(url, params);
res_code = res.first;
res_str = std::string(res.second.data(), res.second.size());
} catch (const std::exception & e) {
LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
}
}
if (res_code == 0) {
if (std::filesystem::exists(cached_response_path)) {
LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
res_str = read_file(cached_response_path);
res_code = 200;
use_cache = true;
} catch (const std::exception & e) {
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
} else {
throw std::runtime_error(
offline ? "error: failed to get manifest (offline mode)"
: "error: failed to get manifest (check your internet connection)");
}
}
std::string ggufFile;
@@ -698,24 +709,25 @@ bool common_has_curl() {
return false;
}
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return false;
}
static bool common_download_model(
const common_params_model &,
const std::string &) {
const std::string &,
bool) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return false;
}
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return {};
}
@@ -742,7 +754,8 @@ struct handle_model_result {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default) {
const std::string & model_path_default,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
@@ -750,7 +763,7 @@ static handle_model_result common_params_handle_model(
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
}
@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(
// then, download it if needed
if (!model.url.empty()) {
bool ok = common_download_model(model, bearer_token);
bool ok = common_download_model(model, bearer_token, offline);
if (!ok) {
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
exit(1);
@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "");
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "");
common_params_handle_model(params.vocoder.model, params.hf_token, "");
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
}
if (params.escape) {
@@ -2996,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
{"--offline"},
"Offline mode: forces use of cache, prevents network access",
[](common_params & params) {
params.offline = true;
}
).set_env("LLAMA_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+7 -4
View File
@@ -170,20 +170,23 @@ std::string common_chat_msg_parser::consume_rest() {
}
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from) {
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
return std::nullopt;
}
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
pos_ = m.groups[0].end;
if (add_prelude_to_content) {
add_content(prelude);
}
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
if (is_partial()) {
throw common_chat_msg_partial_exception(regex.str());
}
return std::nullopt;
}
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
pos_ = m.groups[0].end;
return find_regex_result{prelude, m.groups};
}
+2 -1
View File
@@ -30,6 +30,7 @@ class common_chat_msg_parser {
const std::string & healing_marker() const { return healing_marker_; }
const bool & is_partial() const { return is_partial_; }
const common_chat_msg & result() const { return result_; }
const common_chat_syntax & syntax() const { return syntax_; }
void move_to(size_t pos) {
if (pos > input_.size()) {
@@ -77,7 +78,7 @@ class common_chat_msg_parser {
std::vector<common_string_range> groups;
};
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos);
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
bool try_consume_literal(const std::string & literal);
+57 -38
View File
@@ -31,6 +31,11 @@ static std::string string_diff(const std::string & last, const std::string & cur
return current;
}
if (!string_starts_with(current, last)) {
if (string_starts_with(last, current)) {
// This happens if the last generation ended on a partial stop word (not erased),
// and the current ended on a stop word (erased).
return "";
}
throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'");
}
return current.substr(last.size());
@@ -101,9 +106,9 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
if (!args_diff.empty() || pref.id != newf.id) {
auto & diff = diffs.emplace_back();
diff.tool_call_index = idx;
diff.tool_call_delta.name = newf.name;
if (pref.id != newf.id) {
diff.tool_call_delta.id = newf.id;
diff.tool_call_delta.name = newf.name;
}
diff.tool_call_delta.arguments = args_diff;
}
@@ -387,22 +392,19 @@ template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_di
delta["content"] = diff.content_delta;
}
if (diff.tool_call_index != std::string::npos) {
json tool_call;
tool_call["index"] = diff.tool_call_index;
if (!diff.tool_call_delta.id.empty()) {
tool_call["id"] = diff.tool_call_delta.id;
tool_call["type"] = "function";
}
json function = json::object();
if (!diff.tool_call_delta.name.empty()) {
function["name"] = diff.tool_call_delta.name;
}
if (!diff.tool_call_delta.id.empty()) {
function["id"] = diff.tool_call_delta.id;
}
if (!diff.tool_call_delta.arguments.empty()) {
function["arguments"] = diff.tool_call_delta.arguments;
}
delta["tool_calls"] = json::array({
json {
{"index", diff.tool_call_index},
{"function", function}
}
});
function["arguments"] = diff.tool_call_delta.arguments;
tool_call["function"] = function;
delta["tool_calls"] = json::array({tool_call});
}
return delta;
}
@@ -654,7 +656,6 @@ static void parse_json_tool_calls(
}
from = std::string::npos;
builder.add_content(res->prelude);
auto maybe_raw_python = name == "python" && allow_raw_python;
if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) {
if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) {
@@ -684,7 +685,6 @@ static void parse_json_tool_calls(
};
if (block_open) {
if (auto res = builder.try_find_regex(*block_open)) {
builder.add_content(res->prelude);
parse_tool_calls();
} else {
builder.add_content(builder.consume_rest());
@@ -697,7 +697,6 @@ static void parse_json_tool_calls(
static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder, const common_regex & prefix, size_t rstrip_prefix = 0) {
static const std::vector<std::vector<std::string>> args_paths = {{"arguments"}};
if (auto res = builder.try_find_regex(prefix)) {
builder.add_content(res->prelude);
builder.move_back(rstrip_prefix);
auto tool_calls = builder.consume_json_with_dumped_args(args_paths);
if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) {
@@ -833,6 +832,10 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
return data;
}
static void common_chat_parse_generic(common_chat_msg_parser & builder) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const std::vector<std::vector<std::string>> content_paths = {
{"response"},
};
@@ -905,6 +908,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
return data;
}
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
parse_prefixed_json_tool_call_array(builder, prefix);
}
@@ -999,7 +1007,6 @@ static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) {
if (auto res = builder.try_find_regex(start_action_regex)) {
// If we didn't extract thoughts, prelude includes them.
builder.add_content(res->prelude);
auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}});
for (const auto & tool_call : tool_calls.value) {
std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
@@ -1014,11 +1021,7 @@ static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) {
}
builder.consume_regex(end_action_regex);
} else if (auto res = builder.try_find_regex(start_response_regex)) {
// If we didn't extract thoughts, prelude includes them.
builder.add_content(res->prelude);
if (auto res = builder.try_find_regex(end_response_regex)) {
builder.add_content(res->prelude);
} else {
if (!builder.try_find_regex(end_response_regex)) {
builder.add_content(builder.consume_rest());
throw common_chat_msg_partial_exception(end_response_regex.str());
}
@@ -1126,6 +1129,11 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const common_regex function_regex(
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
static const common_regex close_regex("\\}\\s*");
@@ -1136,8 +1144,6 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
if (with_builtin_tools) {
static const common_regex builtin_call_regex("<\\|python_tag\\|>");
if (auto res = builder.try_find_regex(builtin_call_regex)) {
builder.add_content(res->prelude);
auto fun_res = builder.consume_regex(function_name_regex);
auto function_name = builder.str(fun_res.groups[1]);
@@ -1253,6 +1259,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
}
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const common_regex tool_calls_begin("(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>|<tool▁calls>)");
static const common_regex tool_calls_end("<tool▁calls▁end>");
@@ -1314,6 +1324,10 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
return data;
}
static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const common_regex prefix(regex_escape(" functools["));
parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1);
}
@@ -1455,15 +1469,12 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
return data;
}
static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) {
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));
if (auto res = builder.try_find_regex(python_tag_regex)) {
builder.add_content(res->prelude);
auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
builder.add_tool_call("python", "", arguments);
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));
static const common_regex function_regex(R"(<function=(\w+)>)");
static const common_regex close_regex(R"(</function>)");
@@ -1475,6 +1486,12 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
function_regex,
close_regex,
std::nullopt);
if (auto res = builder.try_find_regex(python_tag_regex)) {
auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
builder.add_tool_call("python", "", arguments);
return;
}
}
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1593,6 +1610,10 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
}
static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const common_regex open_regex(
"(?:"
@@ -1614,8 +1635,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
);
if (auto res = builder.try_find_regex(open_regex)) {
builder.add_content(res->prelude);
const auto & block_start = res->groups[1];
std::string block_end = block_start.empty() ? "" : "```";
@@ -1851,10 +1870,10 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.add_content(builder.consume_rest());
}
static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) {
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str());
static void common_chat_parse(common_chat_msg_parser & builder) {
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str());
switch (format) {
switch (builder.syntax().format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
common_chat_parse_content_only(builder);
break;
@@ -1889,7 +1908,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
common_chat_parse_command_r7b(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format));
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
builder.finish();
}
@@ -1897,7 +1916,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
common_chat_msg_parser builder(input, is_partial, syntax);
try {
common_chat_parse(builder, syntax.format);
common_chat_parse(builder);
} catch (const common_chat_msg_partial_exception & ex) {
LOG_DBG("Partial parse: %s\n", ex.what());
if (!is_partial) {
+1
View File
@@ -144,6 +144,7 @@ struct common_chat_syntax {
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+1
View File
@@ -291,6 +291,7 @@ struct common_params {
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+2 -2
View File
@@ -41,8 +41,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to process\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
+6 -6
View File
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}
static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to process\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_encode(ctx, batch, out, s, n_embd);
batch_process(ctx, batch, out, s, n_embd);
common_batch_clear(batch);
p += s;
s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
// final batch
float * out = emb + p * n_embd;
batch_encode(ctx, batch, out, s, n_embd);
batch_process(ctx, batch, out, s, n_embd);
// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0);
batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
common_batch_clear(query_batch);
+2 -2
View File
@@ -10,8 +10,8 @@ Proof of concept:
``` sh
export model_name=llama_3.2-1b && export quantization=f32
./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
```
The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
+1 -1
View File
@@ -168,7 +168,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#if !defined(GGML_USE_HIP)
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
cuGetErrorString(err, &err_str);
+14
View File
@@ -13,6 +13,7 @@
#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
@@ -481,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
return x;
}
/* Helper for Computing the linear offset of a ggml_tensor given
per-dimension sizes, strides, and indices */
template<int N>
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
size_t offset = 0;
#pragma unroll
for (int i = 0; i < N; i++) {
auto index_i = indices[i];
offset += strides[i] * index_i;
}
return offset;
}
// Helper for vec loading aligned data
template <typename Tp, int n>
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
+1
View File
@@ -4241,6 +4241,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
#endif
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return true;
case GGML_OP_L2_NORM:
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
+94 -67
View File
@@ -1,40 +1,50 @@
#include "norm.hpp"
#include "ggml-sycl/common.hpp"
#include "ggml-sycl/presets.hpp"
static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
const int tid = item_ct1.get_local_id(2);
static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);
const int nthreads = item_ct1.get_local_range(2);
const int sample = item_ct1.get_group(0);
const int channel = item_ct1.get_group(1);
const int row = item_ct1.get_group(2);
const int tid = item_ct1.get_local_id(2);
const int nwarps = nthreads / WARP_SIZE;
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
x += strided_offset;
dst += packed_offset;
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row * ncols + col];
const float xi = x[col];
mean_var.x() += xi;
mean_var.y() += xi * xi;
}
// sum up partial sums
mean_var = warp_reduce_sum(mean_var, item_ct1);
if (block_size > WARP_SIZE) {
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
if (block_size > WARP_SIZE) {
const auto sub_group = item_ct1.get_sub_group();
const auto sg_id = sub_group.get_group_linear_id();
const auto wi_in_sg = sub_group.get_local_linear_id();
if (wi_in_sg == 0) {
s_sum[sg_id] = mean_var;
}
/*
DPCT1118:0: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
item_ct1.barrier(sycl::access::fence_space::local_space);
mean_var = 0.f;
size_t nreduce = nwarps / WARP_SIZE;
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
for (size_t i = 0; i < nreduce; i += 1)
{
mean_var += s_sum[lane_id + i * WARP_SIZE];
mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
}
mean_var = warp_reduce_sum(mean_var, item_ct1);
}
@@ -44,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
const float inv_std = sycl::rsqrt(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
dst[col] = (x[col] - mean) * inv_std;
}
}
@@ -135,39 +145,51 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
}
}
static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
const int tid = item_ct1.get_local_id(2);
static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);
const int sample = item_ct1.get_group(0);
const int channel = item_ct1.get_group(1);
const int row = item_ct1.get_group(2);
const int nthreads = item_ct1.get_local_range(2);
const int tid = item_ct1.get_local_id(2);
const int nwarps = nthreads / WARP_SIZE;
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
x += strided_offset;
dst += packed_offset;
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row * ncols + col];
const float xi = x[col];
tmp += xi * xi;
}
// sum up partial sums
tmp = warp_reduce_sum(tmp, item_ct1);
if (block_size > WARP_SIZE) {
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
const auto sub_group = item_ct1.get_sub_group();
const auto sg_id = sub_group.get_group_linear_id();
const auto wi_in_sg = sub_group.get_local_linear_id();
if (wi_in_sg == 0) {
s_sum[sg_id] = tmp;
}
/*
DPCT1118:3: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
item_ct1.barrier(sycl::access::fence_space::local_space);
size_t nreduce = nwarps / WARP_SIZE;
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
tmp = 0.f;
for (size_t i = 0; i < nreduce; i += 1)
{
tmp += s_sum[lane_id + i * WARP_SIZE];
tmp += s_sum[wi_in_sg + i * WARP_SIZE];
}
tmp = warp_reduce_sum(tmp, item_ct1);
}
@@ -176,7 +198,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
const float scale = sycl::rsqrt(mean + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[row * ncols + col] = scale * x[row * ncols + col];
dst[col] = scale * x[col];
}
}
@@ -224,20 +246,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float
}
}
static void norm_f32_sycl(const float* x, float* dst, const int ncols,
const int nrows, const float eps,
queue_ptr stream, int device) {
static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
const float eps, queue_ptr stream, int device) {
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, eps, item_ct1,
nullptr, WARP_SIZE);
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
});
});
}
@@ -252,15 +274,12 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
*/
stream->submit([&](sycl::handler& cgh) {
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, eps, item_ct1,
get_pointer(s_sum_acc_ct1), work_group_size);
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
}
@@ -313,21 +332,20 @@ static void group_norm_f32_sycl(const float* x, float* dst,
}
}
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
const int nrows, const float eps,
queue_ptr stream, int device) {
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
if (ncols < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, eps, item_ct1,
nullptr, WARP_SIZE);
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
});
});
}
@@ -344,12 +362,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
cgh);
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, eps, item_ct1,
get_pointer(s_sum_acc_ct1), work_group_size);
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
}
@@ -398,12 +414,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
}
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = dst->src[0]->ne[0];
const int64_t nrows = ggml_nrows(dst->src[0]);
GGML_TENSOR_UNARY_OP_LOCALS
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
@@ -411,8 +427,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
}
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
@@ -436,11 +458,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = dst->src[0]->ne[0];
const int64_t nrows = ggml_nrows(dst->src[0]);
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
@@ -450,7 +471,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
GGML_TENSOR_UNARY_OP_LOCALS
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
}
void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+2 -2
View File
@@ -17,14 +17,14 @@ rm -f llama-bench.sqlite > /dev/null
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
if [ -n "$GGML_CUDA" ]; then
cmake_opts="-DGGML_CUDA=ON"
CMAKE_OPTS="${CMAKE_OPTS} -DGGML_CUDA=ON"
fi
dir="build-bench"
function run {
rm -fr ${dir} > /dev/null
cmake -B ${dir} -S . $cmake_opts > /dev/null
cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
cmake --build ${dir} -t llama-bench > /dev/null
${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
}
+14 -2
View File
@@ -693,12 +693,18 @@ int llama_context::encode(llama_batch & inp_batch) {
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
// TODO: move the validation to the llama_batch_allocr
if (batch.token) {
for (int32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
throw -1;
}
}
}
@@ -852,7 +858,7 @@ int llama_context::encode(llama_batch & inp_batch) {
int llama_context::decode(llama_batch & inp_batch) {
if (!memory) {
LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
return encode(inp_batch);
}
@@ -887,11 +893,17 @@ int llama_context::decode(llama_batch & inp_batch) {
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
// TODO: move the validation to the llama_batch_allocr
if (batch.token) {
for (int64_t i = 0; i < n_tokens_all; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
throw std::runtime_error("invalid token");
return -1;
}
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
return -1;
}
}
}
+55 -29
View File
@@ -401,9 +401,12 @@ static common_chat_msg simple_assist_msg(const std::string & content, const std:
}
return msg;
}
const common_chat_msg message_assist = simple_assist_msg("Hello, world!\nWhat's up?");
const common_chat_msg message_assist_empty = simple_assist_msg("");
const common_chat_msg message_assist_thoughts_unparsed_deepseek = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?");
const common_chat_msg message_assist = simple_assist_msg("Hello, world!\nWhat's up?");
const common_chat_msg message_assist_empty = simple_assist_msg("");
const common_chat_msg message_assist_thoughts_unparsed_deepseek = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?");
const common_chat_msg message_assist_thoughts_unparsed_md = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```");
const common_chat_msg message_assist_thoughts_unparsed_md_partial = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}");
const common_chat_msg message_assist_thoughts_unparsed_r7b = simple_assist_msg("<|START_THINKING|>I'm\nthinking<|END_THINKING|>Hello, world!\nWhat's up?");
const common_chat_msg message_assist_thoughts = simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking");
const common_chat_msg message_assist_thoughts_unopened_unparsed = simple_assist_msg("I'm\nthinking</think>Hello, world!\nWhat's up?");
@@ -591,8 +594,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
common_chat_parse(
@@ -619,8 +620,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_call_idx,
common_chat_parse(
@@ -632,8 +631,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_no_content,
common_chat_parse(
@@ -644,8 +641,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
test_templates(tmpls.get(), end_tokens, message_assist_call_idx, tools,
@@ -675,6 +670,18 @@ static void test_template_output_parsers() {
// Generic tool calls doesn't generate / parse content-only messages symmetrically.
assert_equals(
simple_assist_msg("{ \"tool_call\" : { \"name\" : \"t"),
common_chat_parse(
"{ \"tool_call\" : { \"name\" : \"t",
/* is_partial= */ true,
{
/* .format = */ COMMON_CHAT_FORMAT_GENERIC,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ false,
}));
assert_equals(
message_assist_empty,
common_chat_parse(
@@ -776,11 +783,9 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(
simple_assist_msg(""),
simple_assist_msg("Let's call something\n"),
common_chat_parse(
"Let's call something\n"
"<tool_call>{\"name",
@@ -788,8 +793,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_call_thoughts,
common_chat_parse(
@@ -979,7 +982,34 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
/* is_partial= */ true,
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
}));
assert_msg_equals(message_assist_thoughts_unparsed_md,
common_chat_parse(
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```",
/* is_partial= */ false,
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ true,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
}));
assert_msg_equals(message_assist_thoughts_unparsed_md_partial,
common_chat_parse(
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```",
/* is_partial= */ true,
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ true,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_unopened_unparsed,
@@ -989,8 +1019,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
@@ -1073,6 +1101,13 @@ static void test_template_output_parsers() {
{COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1}));
}
assert_equals(
message_assist_call,
common_chat_parse(
"<function=special_function>{\"arg1\": 1}<",
/* is_partial= */ true,
{COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1}));
test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
"<function=special_function>{\"arg1\": 1}</function>");
@@ -1187,8 +1222,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_unopened_unparsed,
common_chat_parse(
@@ -1197,8 +1230,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
@@ -1252,8 +1283,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
@@ -1295,8 +1324,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
"<tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>special_function\n"
@@ -1356,8 +1383,7 @@ static void test_msg_diffs_compute() {
common_chat_msg_diff diff12;
diff12.tool_call_index = 0;
diff12.tool_call_delta.name = "special_function";
// Note: id doesnt change here.
// Note: neither id nor name change here.
diff12.tool_call_delta.arguments = "g1\": 1}";
assert_equals(
+2 -7
View File
@@ -364,6 +364,7 @@ struct server_task {
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream;
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}
{
@@ -3394,13 +3395,7 @@ struct server_context {
batch.logits + i,
};
int ret = 0;
if (do_encode) {
ret = llama_encode(ctx, batch_view);
} else {
ret = llama_decode(ctx, batch_view);
}
const int ret = llama_decode(ctx, batch_view);
metrics.on_decoded(slots);
@@ -121,6 +121,30 @@ def test_completion_stream_with_openai_library():
assert match_regex("(going|bed)+", output_text)
# Test case from https://github.com/ggml-org/llama.cpp/issues/13780
@pytest.mark.slow
def test_completion_stream_with_openai_library_stops():
global server
server.model_hf_repo = "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M"
server.model_hf_file = None
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="System: You are helpfull assistant.\nAssistant:\nHey! How could I help?\nUser:\nTell me a joke.\nAssistant:\n",
stop=["User:\n", "Assistant:\n"],
max_tokens=200,
stream=True,
)
output_text = ''
for data in res:
choice = data.choices[0]
if choice.finish_reason is None:
assert choice.text is not None
output_text += choice.text
assert match_regex("Sure, here's one for[\\s\\S]*", output_text), f'Unexpected output: {output_text}'
@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
global server
+6 -2
View File
@@ -328,6 +328,10 @@ class ServerProcess:
if 'function' not in tc:
raise ValueError(f"Expected function type, got {tc['type']}")
if tc['index'] >= len(tool_calls):
assert 'id' in tc
assert tc.get('type') == 'function'
assert 'function' in tc and 'name' in tc['function'] and len(tc['function']['name']) > 0, \
f"Expected function call with name, got {tc.get('function')}"
tool_calls.append(dict(
id="",
type="function",
@@ -340,10 +344,10 @@ class ServerProcess:
if tc.get('id') is not None:
tool_call['id'] = tc['id']
fct = tc['function']
assert 'id' not in fct, f"Function call should not have id: {fct}"
if fct.get('name') is not None:
tool_call['function']['name'] = fct['name']
tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
if fct.get('arguments') is not None:
assert len(fct['arguments']) > 0, f'Expected non empty arguments delta!'
tool_call['function']['arguments'] += fct['arguments']
print(f'Streamed response had {content_parts} content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
+5 -2
View File
@@ -735,8 +735,11 @@ static json oaicompat_chat_params_parse(
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
inputs.reasoning_format = opt.reasoning_format;
inputs.enable_thinking = opt.enable_thinking;
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
if (body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
llama_params["parse_tool_calls"] = true;
}
// if the assistant message appears at the end of list, we do not add end-of-turn token