mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-30 09:37:42 +02:00
Compare commits
53 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 34b7c0439e | |||
| f3101a8cc6 | |||
| 1c49c70d07 | |||
| a8ea03d8ad | |||
| 05f6ac6283 | |||
| bc583e3c63 | |||
| 72b090da2c | |||
| 7fe03e7446 | |||
| 952f3953c1 | |||
| 81713121ee | |||
| f9cd68398b | |||
| 4f81b33e32 | |||
| cdf94a1802 | |||
| a26c4cc11e | |||
| 4265a87b59 | |||
| 6f180b915c | |||
| 03f582ae8f | |||
| 88c125f2ac | |||
| d74e94c1b3 | |||
| f13847cfb5 | |||
| 79c137f776 | |||
| 22229314fc | |||
| 9012eb9b45 | |||
| fef693dc6b | |||
| 2d38b6e400 | |||
| e121edc432 | |||
| 2f099b510f | |||
| aa50ba462f | |||
| de2ef53a4b | |||
| c508256db2 | |||
| 40aaa8a403 | |||
| a08c1d2845 | |||
| d785f9c1fd | |||
| 4032ca4066 | |||
| 515fdbf7ed | |||
| f5cd27b71d | |||
| a2d02d5793 | |||
| 17fc817b58 | |||
| 2bd1b30f69 | |||
| 259469c4b5 | |||
| 4c32832c59 | |||
| c3a2624339 | |||
| ffd0eae60b | |||
| b775345d78 | |||
| a70a8a69c2 | |||
| d13d0f6135 | |||
| 8a2afb7520 | |||
| 9ecf3e66a3 | |||
| faaaff5f94 | |||
| e16c4731c7 | |||
| 1dcd01960c | |||
| c10ed6cbcc | |||
| a127ff1780 |
@@ -260,16 +260,18 @@ jobs:
|
||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
||||
|
||||
- name: Build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build -G "Ninja Multi-Config" `
|
||||
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake `
|
||||
-DGGML_NATIVE=OFF `
|
||||
-DGGML_BACKEND_DL=ON `
|
||||
-DGGML_CPU_ALL_VARIANTS=ON `
|
||||
-DGGML_OPENMP=OFF `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
|
||||
-DGGML_OPENMP=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release
|
||||
|
||||
@@ -279,6 +281,7 @@ jobs:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
|
||||
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
|
||||
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
name: Update Winget Package
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
- cron: '28 5 * * *' # Update every day at 5:28 UTC
|
||||
|
||||
jobs:
|
||||
update:
|
||||
name: Update Winget Package
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Install cargo binstall
|
||||
uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
|
||||
|
||||
- name: Install komac
|
||||
run: |
|
||||
cargo binstall komac@2.11.2 -y
|
||||
|
||||
- name: Find latest release
|
||||
id: find_latest_release
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
const { data: releases } = await github.rest.repos.listReleases({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
});
|
||||
console.log("Latest release:", releases[0].tag_name);
|
||||
return releases[0].tag_name;
|
||||
|
||||
- name: Update manifest
|
||||
env:
|
||||
VERSION: ${{ steps.find_latest_release.outputs.result }}
|
||||
run: |
|
||||
echo "Updating manifest..."
|
||||
komac update --version ${{ env.VERSION }} \
|
||||
--urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
|
||||
--token ${{ secrets.WINGET_GITHUB_TOKEN }} \
|
||||
--submit \
|
||||
ggml.llamacpp
|
||||
@@ -60,12 +60,16 @@ add_library(${TARGET} STATIC
|
||||
base64.hpp
|
||||
chat.cpp
|
||||
chat.h
|
||||
chat-parser.cpp
|
||||
chat-parser.h
|
||||
common.cpp
|
||||
common.h
|
||||
console.cpp
|
||||
console.h
|
||||
json-schema-to-grammar.cpp
|
||||
json.hpp
|
||||
json-partial.h
|
||||
json-partial.cpp
|
||||
llguidance.cpp
|
||||
log.cpp
|
||||
log.h
|
||||
|
||||
+140
-111
@@ -242,7 +242,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
|
||||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
|
||||
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
if (file_exists) {
|
||||
if (offline) {
|
||||
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
|
||||
return true; // skip verification/downloading
|
||||
}
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
} else {
|
||||
if (offline) {
|
||||
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
@@ -269,91 +318,47 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
if (file_exists) {
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
return n_items;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
// get ETag to see if the remote file has changed
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
// we only allow retrying once for HEAD requests
|
||||
// this is for the use case of using running offline (no internet), retrying can be annoying
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
||||
if (!was_perform_successful) {
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
return n_items;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
// we only allow retrying once for HEAD requests
|
||||
// this is for the use case of using running offline (no internet), retrying can be annoying
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
||||
if (!was_perform_successful) {
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
// if head_request_ok is false, we don't have the etag or last-modified headers
|
||||
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
|
||||
// download multiple files from remote URLs to local paths
|
||||
// the input is a vector of pairs <url, path>
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
for (auto const & item : urls) {
|
||||
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token);
|
||||
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token, offline);
|
||||
}, item));
|
||||
}
|
||||
|
||||
@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
|
||||
|
||||
static bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token) {
|
||||
const std::string & bearer_token,
|
||||
bool offline) {
|
||||
// Basic validation of the model.url
|
||||
if (model.url.empty()) {
|
||||
LOG_ERR("%s: invalid model url\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token)) {
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -547,7 +553,7 @@ static bool common_download_model(
|
||||
}
|
||||
|
||||
// Download in parallel
|
||||
common_download_file_multiple(urls, bearer_token);
|
||||
common_download_file_multiple(urls, bearer_token, offline);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||
*
|
||||
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
||||
*/
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
long res_code = 0;
|
||||
std::string res_str;
|
||||
bool use_cache = false;
|
||||
try {
|
||||
auto res = common_remote_get_content(url, params);
|
||||
res_code = res.first;
|
||||
res_str = std::string(res.second.data(), res.second.size());
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("error: failed to get manifest: %s\n", e.what());
|
||||
LOG_WRN("try reading from cache\n");
|
||||
// try to read from cache
|
||||
if (!offline) {
|
||||
try {
|
||||
auto res = common_remote_get_content(url, params);
|
||||
res_code = res.first;
|
||||
res_str = std::string(res.second.data(), res.second.size());
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
if (res_code == 0) {
|
||||
if (std::filesystem::exists(cached_response_path)) {
|
||||
LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
|
||||
res_str = read_file(cached_response_path);
|
||||
res_code = 200;
|
||||
use_cache = true;
|
||||
} catch (const std::exception & e) {
|
||||
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
offline ? "error: failed to get manifest (offline mode)"
|
||||
: "error: failed to get manifest (check your internet connection)");
|
||||
}
|
||||
}
|
||||
std::string ggufFile;
|
||||
@@ -698,24 +709,25 @@ bool common_has_curl() {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
|
||||
static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_model(
|
||||
const common_params_model &,
|
||||
const std::string &) {
|
||||
const std::string &,
|
||||
bool) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return {};
|
||||
}
|
||||
@@ -742,7 +754,8 @@ struct handle_model_result {
|
||||
static handle_model_result common_params_handle_model(
|
||||
struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
const std::string & model_path_default) {
|
||||
const std::string & model_path_default,
|
||||
bool offline) {
|
||||
handle_model_result result;
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
@@ -750,7 +763,7 @@ static handle_model_result common_params_handle_model(
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (model.hf_file.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
|
||||
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
|
||||
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
|
||||
exit(1); // built without CURL, error message already printed
|
||||
}
|
||||
@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(
|
||||
|
||||
// then, download it if needed
|
||||
if (!model.url.empty()) {
|
||||
bool ok = common_download_model(model, bearer_token);
|
||||
bool ok = common_download_model(model, bearer_token, offline);
|
||||
if (!ok) {
|
||||
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
|
||||
exit(1);
|
||||
@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
|
||||
// handle model and download
|
||||
{
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
// only download mmproj if the current example is using it
|
||||
for (auto & ex : mmproj_examples) {
|
||||
if (ctx_arg.ex == ex) {
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "");
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
|
||||
break;
|
||||
}
|
||||
}
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
|
||||
}
|
||||
|
||||
if (params.escape) {
|
||||
@@ -2848,15 +2861,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
|
||||
add_opt(common_arg(
|
||||
{"--reasoning-format"}, "FORMAT",
|
||||
"reasoning format (default: deepseek; allowed values: deepseek, none)\n"
|
||||
"controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
|
||||
"only supported for non-streamed responses",
|
||||
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
|
||||
"- none: leaves thoughts unparsed in `message.content`\n"
|
||||
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
|
||||
"(default: deepseek)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
||||
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
||||
else { std::invalid_argument("invalid value"); }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
|
||||
add_opt(common_arg(
|
||||
{"--reasoning-budget"}, "N",
|
||||
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
|
||||
[](common_params & params, int value) {
|
||||
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
|
||||
params.reasoning_budget = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
|
||||
add_opt(common_arg(
|
||||
{"--chat-template"}, "JINJA_TEMPLATE",
|
||||
string_format(
|
||||
@@ -2955,7 +2977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
|
||||
else if (value == "md") { params.batched_bench_output_jsonl = false; }
|
||||
else { std::invalid_argument("invalid value"); }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
||||
add_opt(common_arg(
|
||||
@@ -2987,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
common_log_set_verbosity_thold(INT_MAX);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--offline"},
|
||||
"Offline mode: forces use of cache, prevents network access",
|
||||
[](common_params & params) {
|
||||
params.offline = true;
|
||||
}
|
||||
).set_env("LLAMA_OFFLINE"));
|
||||
add_opt(common_arg(
|
||||
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
|
||||
|
||||
@@ -0,0 +1,379 @@
|
||||
#include "chat-parser.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "regex-partial.h"
|
||||
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
|
||||
: input_(input), is_partial_(is_partial), syntax_(syntax)
|
||||
{
|
||||
result_.role = "assistant";
|
||||
|
||||
while (true) {
|
||||
std::string id = std::to_string(std::rand());
|
||||
if (input.find(id) == std::string::npos) {
|
||||
healing_marker_ = id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
|
||||
GGML_ASSERT(rng.begin <= rng.end);
|
||||
return input_.substr(rng.begin, rng.end - rng.begin);
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::add_content(const std::string &content) {
|
||||
result_.content += content;
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
|
||||
result_.reasoning_content += reasoning_content;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
|
||||
if (name.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
common_chat_tool_call tool_call;
|
||||
tool_call.name = name;
|
||||
tool_call.arguments = arguments;
|
||||
tool_call.id = id;
|
||||
|
||||
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
||||
result_.tool_calls.emplace_back(tool_call);
|
||||
return true;
|
||||
}
|
||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
|
||||
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
|
||||
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
|
||||
return add_tool_call(name, id, arguments);
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_calls(const json & arr) {
|
||||
for (const auto & item : arr) {
|
||||
if (!add_tool_call(item)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
void common_chat_msg_parser::finish() {
|
||||
if (!is_partial_ && pos_ != input_.size()) {
|
||||
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
|
||||
}
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::consume_spaces() {
|
||||
const auto length = input_.size();
|
||||
auto consumed = false;
|
||||
while (pos_ < length && std::isspace(input_[pos_])) {
|
||||
++pos_;
|
||||
consumed = true;
|
||||
}
|
||||
return consumed;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
|
||||
auto pos = pos_;
|
||||
for (auto i = 0u; i < literal.size(); ++i) {
|
||||
if (pos >= input_.size()) {
|
||||
return false;
|
||||
}
|
||||
if (input_[pos] != literal[i]) {
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
pos_ = pos;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
|
||||
auto idx = input_.find(literal, pos_);
|
||||
if (idx != std::string::npos) {
|
||||
find_regex_result res;
|
||||
res.prelude = input_.substr(pos_, idx - pos_);
|
||||
auto end = idx + literal.size();
|
||||
res.groups.emplace_back(common_string_range{idx, end});
|
||||
move_to(end);
|
||||
return res;
|
||||
}
|
||||
if (is_partial_) {
|
||||
idx = string_find_partial_stop(input_, literal);
|
||||
if (idx != std::string::npos && idx >= pos_) {
|
||||
find_regex_result res;
|
||||
res.prelude = input_.substr(pos_, idx - pos_);
|
||||
auto end = input_.size();
|
||||
res.groups.emplace_back(common_string_range{idx, end});
|
||||
move_to(end);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::consume_literal(const std::string & literal) {
|
||||
if (!try_consume_literal(literal)) {
|
||||
throw common_chat_msg_partial_exception(literal);
|
||||
}
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
|
||||
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
|
||||
auto stripped_reasoning = string_strip(reasoning);
|
||||
if (stripped_reasoning.empty()) {
|
||||
return;
|
||||
}
|
||||
if (syntax_.reasoning_in_content) {
|
||||
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
|
||||
add_content(stripped_reasoning);
|
||||
if (closed) {
|
||||
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
|
||||
}
|
||||
} else {
|
||||
add_reasoning_content(stripped_reasoning);
|
||||
}
|
||||
};
|
||||
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
|
||||
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
|
||||
if (auto res = try_find_literal(end_think)) {
|
||||
handle_reasoning(res->prelude, /* closed */ true);
|
||||
consume_spaces();
|
||||
return true;
|
||||
}
|
||||
auto rest = consume_rest();
|
||||
if (!rest.empty()) {
|
||||
handle_reasoning(rest, /* closed */ !is_partial());
|
||||
}
|
||||
if (!syntax_.thinking_forced_open) {
|
||||
throw common_chat_msg_partial_exception(end_think);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string common_chat_msg_parser::consume_rest() {
|
||||
auto rest = input_.substr(pos_);
|
||||
pos_ = input_.size();
|
||||
return rest;
|
||||
}
|
||||
|
||||
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
|
||||
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
|
||||
pos_ = m.groups[0].end;
|
||||
|
||||
if (add_prelude_to_content) {
|
||||
add_content(prelude);
|
||||
}
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
|
||||
if (is_partial()) {
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
return find_regex_result{prelude, m.groups};
|
||||
}
|
||||
|
||||
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
|
||||
if (auto result = try_consume_regex(regex)) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
|
||||
auto m = regex.search(input_, pos_);
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
|
||||
if (is_partial()) {
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
if (m.groups[0].begin != pos_) {
|
||||
// Didn't match at the current position.
|
||||
return std::nullopt;
|
||||
}
|
||||
pos_ = m.groups[0].end;
|
||||
|
||||
return find_regex_result {
|
||||
/* .prelude = */ "",
|
||||
m.groups,
|
||||
};
|
||||
}
|
||||
|
||||
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
|
||||
auto it = input_.cbegin() + pos_;
|
||||
const auto end = input_.cend();
|
||||
common_json result;
|
||||
if (!common_json_parse(it, end, healing_marker_, result)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
pos_ = std::distance(input_.cbegin(), it);
|
||||
if (result.healing_marker.marker.empty()) {
|
||||
// No healing marker, just return the parsed json
|
||||
return result;
|
||||
}
|
||||
if (!is_partial()) {
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
common_json common_chat_msg_parser::consume_json() {
|
||||
if (auto result = try_consume_json()) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
|
||||
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths,
|
||||
const std::vector<std::vector<std::string>> & content_paths
|
||||
) {
|
||||
if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths,
|
||||
const std::vector<std::vector<std::string>> & content_paths
|
||||
) {
|
||||
auto partial = try_consume_json();
|
||||
if (!partial) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto is_arguments_path = [&](const std::vector<std::string> & path) {
|
||||
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
|
||||
};
|
||||
auto is_content_path = [&](const std::vector<std::string> & path) {
|
||||
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
|
||||
};
|
||||
|
||||
if (partial->healing_marker.marker.empty()) {
|
||||
if (args_paths.empty()) {
|
||||
// No arguments to dump, and JSON was parsed fully.
|
||||
return consume_json_result {
|
||||
partial->json,
|
||||
/* .is_partial = */ false,
|
||||
};
|
||||
}
|
||||
if (is_arguments_path({})) {
|
||||
// Entire JSON is the arguments and was parsed fully.
|
||||
return consume_json_result {
|
||||
partial->json.dump(),
|
||||
/* .is_partial = */ false,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||
|
||||
auto found_healing_marker = false;
|
||||
std::vector<std::string> path;
|
||||
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
|
||||
if (is_arguments_path(path)) {
|
||||
auto arguments = j.dump();
|
||||
if (is_partial() && !partial->healing_marker.marker.empty()) {
|
||||
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
|
||||
if (idx != std::string::npos) {
|
||||
arguments.resize(idx);
|
||||
found_healing_marker = true;
|
||||
}
|
||||
if (arguments == "\"") {
|
||||
// This happens because of completing `:"$magic` after `"arguments"`
|
||||
arguments = "";
|
||||
}
|
||||
}
|
||||
return arguments;
|
||||
}
|
||||
if (is_content_path(path)) {
|
||||
if (!j.is_string()) {
|
||||
throw std::runtime_error("Content path must be a string");
|
||||
}
|
||||
std::string str = j;
|
||||
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
|
||||
if (idx != std::string::npos) {
|
||||
str.resize(idx);
|
||||
found_healing_marker = true;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
if (j.is_object()) {
|
||||
auto obj = json::object();
|
||||
for (const auto & p : j.items()) {
|
||||
const auto & key = p.key();
|
||||
const auto & value = p.value();
|
||||
const std::string key_str = key; // NOLINT
|
||||
auto idx = key_str.find(healing_marker_);
|
||||
if (idx != std::string::npos) {
|
||||
found_healing_marker = true;
|
||||
break;
|
||||
}
|
||||
path.push_back(key_str);
|
||||
if (value.is_string()) {
|
||||
const std::string value_str = value;
|
||||
if (value_str.find(healing_marker_) != std::string::npos) {
|
||||
found_healing_marker = true;
|
||||
if (is_content_path(path)) {
|
||||
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
|
||||
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
|
||||
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
obj[key] = value;
|
||||
} else {
|
||||
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||
}
|
||||
path.pop_back();
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
if (j.is_array()) {
|
||||
auto arr = json::array();
|
||||
for (const auto & value : j) {
|
||||
if (value.is_string()) {
|
||||
std::string str = value;
|
||||
auto idx = str.find(healing_marker_);
|
||||
if (idx != std::string::npos) {
|
||||
// Don't heal array values that aren't in the arguments.
|
||||
found_healing_marker = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
arr.push_back(remove_unsupported_healings_and_dump_args(value));
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
return j;
|
||||
};
|
||||
|
||||
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
|
||||
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||
return consume_json_result {
|
||||
cleaned,
|
||||
/* .is_partial = */ found_healing_marker,
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
#pragma once
|
||||
|
||||
#include "chat.h"
|
||||
#include "json-partial.h"
|
||||
#include "json.hpp"
|
||||
#include "regex-partial.h"
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
class common_chat_msg_partial_exception : public std::runtime_error {
|
||||
public:
|
||||
common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
|
||||
};
|
||||
|
||||
class common_chat_msg_parser {
|
||||
std::string input_;
|
||||
bool is_partial_;
|
||||
common_chat_syntax syntax_;
|
||||
std::string healing_marker_;
|
||||
|
||||
size_t pos_ = 0;
|
||||
common_chat_msg result_;
|
||||
|
||||
public:
|
||||
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
const std::string & input() const { return input_; }
|
||||
size_t pos() const { return pos_; }
|
||||
const std::string & healing_marker() const { return healing_marker_; }
|
||||
const bool & is_partial() const { return is_partial_; }
|
||||
const common_chat_msg & result() const { return result_; }
|
||||
const common_chat_syntax & syntax() const { return syntax_; }
|
||||
|
||||
void move_to(size_t pos) {
|
||||
if (pos > input_.size()) {
|
||||
throw std::runtime_error("Invalid position!");
|
||||
}
|
||||
pos_ = pos;
|
||||
}
|
||||
void move_back(size_t n) {
|
||||
if (pos_ < n) {
|
||||
throw std::runtime_error("Can't move back that far!");
|
||||
}
|
||||
pos_ -= n;
|
||||
}
|
||||
|
||||
// Get the substring of the input at the given range
|
||||
std::string str(const common_string_range & rng) const;
|
||||
|
||||
// Appends to the result.content field
|
||||
void add_content(const std::string & content);
|
||||
|
||||
// Appends to the result.reasoning_content field
|
||||
void add_reasoning_content(const std::string & reasoning_content);
|
||||
|
||||
// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
|
||||
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
|
||||
|
||||
// Adds a tool call using the "name", "id" and "arguments" fields of the json object
|
||||
bool add_tool_call(const nlohmann::ordered_json & tool_call);
|
||||
|
||||
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
|
||||
bool add_tool_calls(const nlohmann::ordered_json & arr);
|
||||
|
||||
void finish();
|
||||
|
||||
bool consume_spaces();
|
||||
|
||||
void consume_literal(const std::string & literal);
|
||||
|
||||
bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
|
||||
|
||||
std::string consume_rest();
|
||||
|
||||
struct find_regex_result {
|
||||
std::string prelude;
|
||||
std::vector<common_string_range> groups;
|
||||
};
|
||||
|
||||
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
|
||||
|
||||
bool try_consume_literal(const std::string & literal);
|
||||
|
||||
std::optional<find_regex_result> try_find_literal(const std::string & literal);
|
||||
|
||||
find_regex_result consume_regex(const common_regex & regex);
|
||||
|
||||
std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
|
||||
|
||||
std::optional<common_json> try_consume_json();
|
||||
common_json consume_json();
|
||||
|
||||
struct consume_json_result {
|
||||
nlohmann::ordered_json value;
|
||||
bool is_partial;
|
||||
};
|
||||
|
||||
/*
|
||||
Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
|
||||
|
||||
By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
|
||||
e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
|
||||
|
||||
But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
|
||||
- with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
|
||||
- with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
|
||||
*/
|
||||
consume_json_result consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths = {},
|
||||
const std::vector<std::vector<std::string>> & content_paths = {}
|
||||
);
|
||||
std::optional<consume_json_result> try_consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths = {},
|
||||
const std::vector<std::vector<std::string>> & content_paths = {}
|
||||
);
|
||||
};
|
||||
+731
-603
File diff suppressed because it is too large
Load Diff
+71
-6
@@ -3,6 +3,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include <functional>
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -13,11 +14,19 @@ struct common_chat_tool_call {
|
||||
std::string name;
|
||||
std::string arguments;
|
||||
std::string id;
|
||||
|
||||
bool operator==(const common_chat_tool_call & other) const {
|
||||
return name == other.name && arguments == other.arguments && id == other.id;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_chat_msg_content_part {
|
||||
std::string type;
|
||||
std::string text;
|
||||
|
||||
bool operator==(const common_chat_msg_content_part & other) const {
|
||||
return type == other.type && text == other.text;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_chat_msg {
|
||||
@@ -28,6 +37,51 @@ struct common_chat_msg {
|
||||
std::string reasoning_content;
|
||||
std::string tool_name;
|
||||
std::string tool_call_id;
|
||||
|
||||
template <class T> T to_json_oaicompat() const;
|
||||
|
||||
bool empty() const {
|
||||
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
|
||||
}
|
||||
void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
|
||||
for (auto i = 0u; i < tool_calls.size(); i++) {
|
||||
if (ids_cache.size() <= i) {
|
||||
auto id = tool_calls[i].id;
|
||||
if (id.empty()) {
|
||||
id = gen_tool_call_id();
|
||||
}
|
||||
ids_cache.push_back(id);
|
||||
}
|
||||
tool_calls[i].id = ids_cache[i];
|
||||
}
|
||||
}
|
||||
bool operator==(const common_chat_msg & other) const {
|
||||
return role == other.role
|
||||
&& content == other.content
|
||||
&& content_parts == other.content_parts
|
||||
&& tool_calls == other.tool_calls
|
||||
&& reasoning_content == other.reasoning_content
|
||||
&& tool_name == other.tool_name
|
||||
&& tool_call_id == other.tool_call_id;
|
||||
}
|
||||
bool operator!=(const common_chat_msg & other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
|
||||
struct common_chat_msg_diff {
|
||||
// std::string reasoning_content_delta;
|
||||
std::string content_delta;
|
||||
size_t tool_call_index = std::string::npos;
|
||||
common_chat_tool_call tool_call_delta;
|
||||
|
||||
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
|
||||
|
||||
bool operator==(const common_chat_msg_diff & other) const {
|
||||
return content_delta == other.content_delta
|
||||
&& tool_call_index == other.tool_call_index
|
||||
&& tool_call_delta == other.tool_call_delta;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_chat_tool {
|
||||
@@ -49,14 +103,11 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_LLAMA_3_X,
|
||||
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
|
||||
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
|
||||
|
||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||
};
|
||||
@@ -71,7 +122,8 @@ struct common_chat_templates_inputs {
|
||||
std::vector<common_chat_tool> tools;
|
||||
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
bool parallel_tool_calls = false;
|
||||
bool extract_reasoning = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
bool enable_thinking = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
};
|
||||
|
||||
@@ -80,11 +132,21 @@ struct common_chat_params {
|
||||
std::string prompt;
|
||||
std::string grammar;
|
||||
bool grammar_lazy = false;
|
||||
bool thinking_forced_open = false;
|
||||
std::vector<common_grammar_trigger> grammar_triggers;
|
||||
std::vector<std::string> preserved_tokens;
|
||||
std::vector<std::string> additional_stops;
|
||||
};
|
||||
|
||||
struct common_chat_syntax {
|
||||
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
|
||||
bool reasoning_in_content = false;
|
||||
bool thinking_forced_open = false;
|
||||
bool parse_tool_calls = true;
|
||||
};
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
|
||||
|
||||
@@ -121,8 +183,9 @@ std::string common_chat_format_example(
|
||||
const struct common_chat_templates * tmpls,
|
||||
bool use_jinja);
|
||||
|
||||
std::string common_chat_format_name(common_chat_format format);
|
||||
common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
|
||||
const char* common_chat_format_name(common_chat_format format);
|
||||
const char* common_reasoning_format_name(common_reasoning_format format);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
||||
|
||||
@@ -135,3 +198,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
|
||||
// T can be std::string containing JSON or nlohmann::ordered_json
|
||||
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
|
||||
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
|
||||
|
||||
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
||||
|
||||
+1
-1
@@ -849,7 +849,7 @@ std::string fs_get_cache_directory() {
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
} else {
|
||||
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
|
||||
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
cache_directory = std::getenv("XDG_CACHE_HOME");
|
||||
} else {
|
||||
|
||||
+3
-1
@@ -115,7 +115,7 @@ enum common_grammar_trigger_type {
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
||||
};
|
||||
|
||||
struct common_grammar_trigger {
|
||||
@@ -291,6 +291,7 @@ struct common_params {
|
||||
int32_t verbosity = 0;
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
bool offline = false;
|
||||
|
||||
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
@@ -368,6 +369,7 @@ struct common_params {
|
||||
bool use_jinja = false; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
int reasoning_budget = -1;
|
||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
@@ -0,0 +1,255 @@
|
||||
#include <json-partial.h>
|
||||
#include "ggml.h"
|
||||
#include "log.h"
|
||||
#include <string>
|
||||
|
||||
#include <json.hpp>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum common_json_stack_element_type {
|
||||
COMMON_JSON_STACK_ELEMENT_OBJECT,
|
||||
COMMON_JSON_STACK_ELEMENT_KEY,
|
||||
COMMON_JSON_STACK_ELEMENT_ARRAY,
|
||||
};
|
||||
|
||||
struct common_json_stack_element {
|
||||
common_json_stack_element_type type;
|
||||
std::string key;
|
||||
};
|
||||
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
std::string::const_iterator it = input.begin();
|
||||
const auto end = input.end();
|
||||
return common_json_parse(it, end, healing_marker, out);
|
||||
}
|
||||
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
std::size_t position;
|
||||
bool found_error;
|
||||
std::string last_token;
|
||||
std::string exception_message;
|
||||
std::vector<common_json_stack_element> stack;
|
||||
|
||||
json_error_locator() : position(0), found_error(false) {}
|
||||
|
||||
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
|
||||
this->position = position - 1;
|
||||
this->found_error = true;
|
||||
this->last_token = last_token;
|
||||
this->exception_message = ex.what();
|
||||
return false;
|
||||
}
|
||||
void close_value() {
|
||||
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
|
||||
stack.pop_back();
|
||||
}
|
||||
}
|
||||
bool null() override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool boolean(bool) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_integer(number_integer_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_unsigned(number_unsigned_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_float(number_float_t, const string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool string(string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool binary(binary_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool start_object(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_object() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool key(string_t & key) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
|
||||
return true;
|
||||
}
|
||||
bool start_array(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_array() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
};
|
||||
json_error_locator err_loc;
|
||||
auto start = it;
|
||||
json::sax_parse(it, end, &err_loc);
|
||||
|
||||
if (err_loc.found_error) {
|
||||
it = start;
|
||||
auto temptative_end = it + err_loc.position;
|
||||
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
|
||||
|
||||
auto input = std::string(it, temptative_end);
|
||||
try {
|
||||
out.json = json::parse(input);
|
||||
// out.json = json::parse(it, temptative_end);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
} catch (const std::exception & ex) {
|
||||
// No, needs healing.
|
||||
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
|
||||
}
|
||||
auto can_parse = [](const std::string & str) {
|
||||
try {
|
||||
auto _ = json::parse(str); // NOLINT
|
||||
return true;
|
||||
} catch (const std::exception &) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
if (!healing_marker.empty() && !err_loc.stack.empty()) {
|
||||
std::string str(it, temptative_end);
|
||||
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
|
||||
if (last_non_sp_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
auto last_non_sp_char = str[last_non_sp_pos];
|
||||
// Used to detect stops on a number, which may not be complete.
|
||||
auto was_maybe_number = [&]() {
|
||||
if (!str.empty() && std::isspace(str.back())) {
|
||||
return false;
|
||||
}
|
||||
return std::isdigit(last_non_sp_char) ||
|
||||
last_non_sp_char == '.' ||
|
||||
last_non_sp_char == 'e' ||
|
||||
last_non_sp_char == 'E' ||
|
||||
last_non_sp_char == '-';
|
||||
};
|
||||
|
||||
std::string closing;
|
||||
for (size_t i = err_loc.stack.size(); i > 0; i--) {
|
||||
auto & el = err_loc.stack[i - 1];
|
||||
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
closing += "}";
|
||||
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
closing += "]";
|
||||
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
throw std::runtime_error("Unexpected stack element type");
|
||||
}
|
||||
}
|
||||
|
||||
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
|
||||
|
||||
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
// We're inside an object value
|
||||
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an object value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + ": 1" + closing)) {
|
||||
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
|
||||
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
|
||||
// Was about to create an object
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an object value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an object value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
// find last :
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to opening : for object value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an array value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an array value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an array value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
|
||||
// Had just finished a value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of("[,");
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to last [ or , for array value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
|
||||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\": 1" + closing)) {
|
||||
// Was inside an object key string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
|
||||
// Was inside an object key string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "Cutting back to last : for object key+value\n");
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
|
||||
out.json = json::parse(str);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
}
|
||||
// TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
|
||||
// fprintf(stderr, "Closing: TODO\n");
|
||||
return false;
|
||||
}
|
||||
out.json = json::parse(it, end);
|
||||
it = end;
|
||||
return true;
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
#pragma once
|
||||
#include <json.hpp>
|
||||
|
||||
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
|
||||
struct common_healing_marker {
|
||||
// Raw marker.
|
||||
std::string marker;
|
||||
|
||||
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
|
||||
std::string json_dump_marker;
|
||||
};
|
||||
|
||||
// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
|
||||
struct common_json {
|
||||
nlohmann::ordered_json json;
|
||||
|
||||
common_healing_marker healing_marker;
|
||||
};
|
||||
|
||||
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
|
||||
//
|
||||
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
|
||||
// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
|
||||
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
|
||||
//
|
||||
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out);
|
||||
|
||||
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out);
|
||||
+7
-8
@@ -161,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
||||
#endif // LLAMA_USE_LLGUIDANCE
|
||||
} else {
|
||||
std::vector<std::string> patterns_at_start;
|
||||
std::vector<std::string> trigger_patterns;
|
||||
std::vector<std::string> patterns_anywhere;
|
||||
std::vector<llama_token> trigger_tokens;
|
||||
for (const auto & trigger : params.grammar_triggers) {
|
||||
@@ -173,10 +173,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
|
||||
{
|
||||
const auto & pattern = trigger.value;
|
||||
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
|
||||
patterns_anywhere.push_back(trigger.value);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
|
||||
{
|
||||
trigger_patterns.push_back(trigger.value);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
|
||||
@@ -190,10 +193,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> trigger_patterns;
|
||||
if (!patterns_at_start.empty()) {
|
||||
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
|
||||
}
|
||||
if (!patterns_anywhere.empty()) {
|
||||
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
|
||||
}
|
||||
|
||||
+162
-42
@@ -432,6 +432,9 @@ class ModelBase:
|
||||
if "llm_config" in config:
|
||||
# rename for InternVL
|
||||
config["text_config"] = config["llm_config"]
|
||||
if "thinker_config" in config:
|
||||
# rename for Qwen2.5-Omni
|
||||
config["text_config"] = config["thinker_config"]["text_config"]
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
@@ -1121,18 +1124,21 @@ class MmprojModel(ModelBase):
|
||||
preprocessor_config: dict[str, Any]
|
||||
global_config: dict[str, Any]
|
||||
|
||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
|
||||
|
||||
has_vision_encoder: bool = True # by default
|
||||
has_audio_encoder: bool = False
|
||||
|
||||
# for models having multiple encoders, we need to separate their hparams
|
||||
hparams_vision: dict[str, Any] | None = None
|
||||
hparams_audio: dict[str, Any] | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
|
||||
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
|
||||
|
||||
if self.has_vision_encoder and self.has_audio_encoder:
|
||||
raise NotImplementedError("both vision + audio not supported yet")
|
||||
|
||||
# get n_embd of the text model
|
||||
if "text_config" not in self.hparams:
|
||||
self.hparams["text_config"] = {}
|
||||
@@ -1143,22 +1149,32 @@ class MmprojModel(ModelBase):
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
||||
# move vision config to the top level, while preserving the original hparams in global_config
|
||||
self.global_config = self.hparams
|
||||
import copy
|
||||
self.global_config = copy.deepcopy(self.hparams)
|
||||
self.hparams_vision = self.get_vision_config()
|
||||
self.hparams_audio = self.get_audio_config()
|
||||
|
||||
if "vision_config" in self.hparams:
|
||||
self.hparams = self.hparams["vision_config"]
|
||||
elif "audio_config" in self.hparams:
|
||||
self.hparams = self.hparams["audio_config"]
|
||||
else:
|
||||
if self.hparams_vision is None and self.hparams_audio is None:
|
||||
raise ValueError("vision_config / audio_config not found in hparams")
|
||||
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
|
||||
# for compat with vision-only models
|
||||
self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
|
||||
|
||||
# TODO @ngxson : this is a hack to support both vision and audio encoders
|
||||
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
|
||||
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
|
||||
|
||||
# load preprocessor config
|
||||
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
|
||||
self.preprocessor_config = json.load(f)
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("audio_config")
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
|
||||
|
||||
@@ -1170,26 +1186,26 @@ class MmprojModel(ModelBase):
|
||||
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
|
||||
|
||||
# vision config
|
||||
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
|
||||
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.block_count)
|
||||
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
|
||||
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
|
||||
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
|
||||
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
|
||||
|
||||
# preprocessor config
|
||||
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
|
||||
|
||||
elif self.has_audio_encoder:
|
||||
if self.has_audio_encoder:
|
||||
self.gguf_writer.add_clip_has_audio_encoder(True)
|
||||
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
|
||||
|
||||
# audio config
|
||||
self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
|
||||
self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_audio_block_count(self.block_count)
|
||||
self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
|
||||
self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
|
||||
self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
|
||||
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
|
||||
|
||||
else:
|
||||
raise ValueError("MmprojModel must have either vision or audio encoder")
|
||||
@@ -1197,6 +1213,22 @@ class MmprojModel(ModelBase):
|
||||
def write_vocab(self):
|
||||
raise ValueError("MmprojModel does not support vocab writing")
|
||||
|
||||
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
assert self.hparams_vision is not None
|
||||
return self._find_param(self.hparams_vision, keys, optional)
|
||||
|
||||
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
assert self.hparams_audio is not None
|
||||
return self._find_param(self.hparams_audio, keys, optional)
|
||||
|
||||
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
|
||||
key = next((k for k in keys if k in obj), None)
|
||||
if key is not None:
|
||||
return obj[key]
|
||||
if optional:
|
||||
return None
|
||||
raise KeyError(f"could not find any of: {keys}")
|
||||
|
||||
|
||||
@ModelBase.register("GPTNeoXForCausalLM")
|
||||
class GPTNeoXModel(TextModel):
|
||||
@@ -2643,7 +2675,7 @@ class QwenModel(TextModel):
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
|
||||
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
|
||||
class Qwen2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2
|
||||
|
||||
@@ -2667,13 +2699,19 @@ class Qwen2Model(TextModel):
|
||||
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
|
||||
if "language_model." in name:
|
||||
name = name.replace("language_model.", "") # for InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
|
||||
or name.startswith("vision_model") or name.startswith("audio_tower"):
|
||||
# skip vision and audio tensors
|
||||
return []
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
@ModelBase.register(
|
||||
"Qwen2VLModel",
|
||||
"Qwen2VLForConditionalGeneration",
|
||||
"Qwen2_5_VLForConditionalGeneration",
|
||||
"Qwen2_5OmniModel",
|
||||
)
|
||||
class Qwen2VLModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
||||
|
||||
@@ -2691,8 +2729,11 @@ class Qwen2VLModel(TextModel):
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("visual."):
|
||||
# skip visual tensors
|
||||
if name.startswith("thinker."):
|
||||
name = name.replace("thinker.", "")
|
||||
if name.startswith("visual") or name.startswith("audio") or \
|
||||
name.startswith("talker") or name.startswith("token2wav"):
|
||||
# skip multimodal tensors
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
@@ -2701,21 +2742,27 @@ class Qwen2VLModel(TextModel):
|
||||
class Qwen2VLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["image_size"] = self.hparams.get("image_size", 560)
|
||||
assert self.hparams_vision is not None
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
|
||||
# rename config.json values
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
|
||||
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
|
||||
if "embed_dim" in self.hparams: # qwen2vl
|
||||
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
|
||||
self.hparams["hidden_size"] = self.hparams.get("embed_dim")
|
||||
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
|
||||
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
|
||||
if "embed_dim" in self.hparams_vision: # qwen2vl
|
||||
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
|
||||
self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if self.global_config['model_type'] == 'qwen2_vl':
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
model_type = self.global_config['model_type']
|
||||
if model_type == 'qwen2_vl':
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
|
||||
elif self.global_config['model_type'] == 'qwen2_5_vl':
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
|
||||
elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
|
||||
if model_type == 'qwen2_5_omni':
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
|
||||
else:
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
# find n_wa_pattern (window attention pattern)
|
||||
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
|
||||
@@ -2773,6 +2820,66 @@ class Qwen2VLVisionModel(MmprojModel):
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2_5OmniModel")
|
||||
class Qwen25OmniModel(Qwen2VLVisionModel):
|
||||
has_vision_encoder = True
|
||||
has_audio_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_audio is not None
|
||||
self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
|
||||
self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("audio_config")
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# SinusoidsPositionEmbedding
|
||||
assert self.hparams_audio is not None
|
||||
max_timescale = 10000
|
||||
length = 1500
|
||||
channels = self.hparams_audio["hidden_size"]
|
||||
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
|
||||
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
|
||||
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
|
||||
pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
|
||||
yield ("audio_tower.embed_positions.weight", pos_embd)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, new_name, n_dims # unused
|
||||
if ".conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
return False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("thinker."):
|
||||
name = name.replace("thinker.", "")
|
||||
|
||||
if name.startswith("audio_tower"):
|
||||
# process audio tensors
|
||||
if "conv1.bias" in name or "conv2.bias" in name:
|
||||
# transpose conv1 and conv2 bias
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
if "audio_bos_eos_token" in name:
|
||||
# this tensor is left unused in transformers code
|
||||
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("InternVisionModel")
|
||||
class InternVisionModel(MmprojModel):
|
||||
def set_gguf_parameters(self):
|
||||
@@ -5993,11 +6100,11 @@ class UltravoxModel(TextModel):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")
|
||||
raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
|
||||
|
||||
|
||||
@ModelBase.register("UltravoxModel")
|
||||
class UltravoxAudioModel(MmprojModel):
|
||||
@ModelBase.register("Qwen2AudioForConditionalGeneration")
|
||||
class WhisperEncoderModel(MmprojModel):
|
||||
has_vision_encoder = False # no vision encoder
|
||||
has_audio_encoder = True
|
||||
|
||||
@@ -6009,10 +6116,9 @@ class UltravoxAudioModel(MmprojModel):
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, new_name, n_dims # unused
|
||||
@@ -6023,6 +6129,10 @@ class UltravoxAudioModel(MmprojModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.startswith("language_model."):
|
||||
# skip language model tensors
|
||||
return []
|
||||
|
||||
# prevent clash naming with vision tensors
|
||||
if name.startswith("multi_modal_projector"):
|
||||
name = "audio." + name
|
||||
@@ -6033,6 +6143,16 @@ class UltravoxAudioModel(MmprojModel):
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("UltravoxModel")
|
||||
class UltravoxWhisperEncoderModel(WhisperEncoderModel):
|
||||
has_vision_encoder = False # no vision encoder
|
||||
has_audio_encoder = True
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
Regular → Executable
+9
@@ -280,6 +280,15 @@ cmake --build build --config release
|
||||
### **GitHub contribution**:
|
||||
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
|
||||
|
||||
## Updates
|
||||
### Basic Flash Attention Support
|
||||
The basic FA kernel with aclnnops has been added in aclnn_ops.cpp.
|
||||
Currently, the FA only supports the cases with FP16 KV tensors and NO logit softcap.
|
||||
Since the aclnn interface for flash attention cannot support the logit softcap, we will only update the quantized version in the future.
|
||||
|
||||
Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang@pku.edu.cn), Ruiyang Ma (ruiyang@stu.pku.edu.cn), and Guojie Luo (gluo@pku.edu.cn).
|
||||
|
||||
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
|
||||
|
||||
## TODO
|
||||
- Support more models and data types.
|
||||
|
||||
+53
-25
@@ -2,7 +2,6 @@
|
||||
|
||||
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
|
||||
- `llama-server` when started w/ `--jinja` flag
|
||||
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
|
||||
|
||||
## Universal support w/ Native & Generic handlers
|
||||
|
||||
@@ -325,36 +324,65 @@ To get the official template from original HuggingFace repos, you can use [scrip
|
||||
> [!TIP]
|
||||
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
|
||||
|
||||
> [!CAUTION]
|
||||
> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance.
|
||||
|
||||
Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
|
||||
|
||||
```bash
|
||||
curl http://localhost:8080/v1/chat/completions -d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"tools": [
|
||||
{
|
||||
"type":"function",
|
||||
"function":{
|
||||
"name":"python",
|
||||
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
|
||||
"parameters":{
|
||||
"type":"object",
|
||||
"properties":{
|
||||
"code":{
|
||||
"type":"string",
|
||||
"description":"The code to run in the ipython interpreter."
|
||||
"model": "gpt-3.5-turbo",
|
||||
"tools": [
|
||||
{
|
||||
"type":"function",
|
||||
"function":{
|
||||
"name":"python",
|
||||
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
|
||||
"parameters":{
|
||||
"type":"object",
|
||||
"properties":{
|
||||
"code":{
|
||||
"type":"string",
|
||||
"description":"The code to run in the ipython interpreter."
|
||||
}
|
||||
},
|
||||
"required":["code"]
|
||||
}
|
||||
},
|
||||
"required":["code"]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Print a hello world message with python."
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Print a hello world message with python."
|
||||
}
|
||||
]
|
||||
}'
|
||||
|
||||
|
||||
curl http://localhost:8080/v1/chat/completions -d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
|
||||
{"role": "user", "content": "What is the weather in Istanbul?"}
|
||||
],
|
||||
"tools": [{
|
||||
"type":"function",
|
||||
"function":{
|
||||
"name":"get_current_weather",
|
||||
"description":"Get the current weather in a given location",
|
||||
"parameters":{
|
||||
"type":"object",
|
||||
"properties":{
|
||||
"location":{
|
||||
"type":"string",
|
||||
"description":"The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
|
||||
}
|
||||
},
|
||||
"required":["location"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
+18
-1
@@ -33,7 +33,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
|
||||
|
||||
## Pre-quantized models
|
||||
|
||||
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org
|
||||
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
|
||||
|
||||
Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`
|
||||
|
||||
@@ -81,6 +81,10 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
|
||||
# Llama 4 Scout
|
||||
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
|
||||
|
||||
# Moondream2 20250414 version
|
||||
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
|
||||
|
||||
```
|
||||
|
||||
**Audio models**:
|
||||
@@ -89,4 +93,17 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
# Ultravox 0.5
|
||||
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
|
||||
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
|
||||
|
||||
# Qwen2-Audio and SeaLLM-Audio
|
||||
# note: no pre-quantized GGUF this model, as they have very poor result
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
|
||||
```
|
||||
|
||||
**Mixed modalities**:
|
||||
|
||||
```sh
|
||||
# Qwen2.5 Omni
|
||||
# Capabilities: audio input, vision input
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
|
||||
```
|
||||
|
||||
@@ -41,8 +41,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to process\n", __func__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
|
||||
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to process\n", __func__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_encode(ctx, batch, out, s, n_embd);
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
common_batch_clear(batch);
|
||||
p += s;
|
||||
s = 0;
|
||||
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// final batch
|
||||
float * out = emb + p * n_embd;
|
||||
batch_encode(ctx, batch, out, s, n_embd);
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
|
||||
// save embeddings to chunks
|
||||
for (int i = 0; i < n_chunks; i++) {
|
||||
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
|
||||
batch_add_seq(query_batch, query_tokens, 0);
|
||||
|
||||
std::vector<float> query_emb(n_embd, 0);
|
||||
batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
|
||||
common_batch_clear(query_batch);
|
||||
|
||||
|
||||
@@ -10,8 +10,8 @@ Proof of concept:
|
||||
|
||||
``` sh
|
||||
export model_name=llama_3.2-1b && export quantization=f32
|
||||
./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
|
||||
./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
|
||||
./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
|
||||
./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
|
||||
```
|
||||
|
||||
The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
|
||||
|
||||
@@ -129,6 +129,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
|
||||
option(GGML_LSX "ggml: enable lsx" ON)
|
||||
option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
|
||||
+11
-2
@@ -528,15 +528,15 @@ extern "C" {
|
||||
GGML_UNARY_OP_STEP,
|
||||
GGML_UNARY_OP_TANH,
|
||||
GGML_UNARY_OP_ELU,
|
||||
GGML_UNARY_OP_RELU,
|
||||
GGML_UNARY_OP_SIGMOID,
|
||||
GGML_UNARY_OP_GELU,
|
||||
GGML_UNARY_OP_GELU_ERF,
|
||||
GGML_UNARY_OP_GELU_QUICK,
|
||||
GGML_UNARY_OP_SILU,
|
||||
GGML_UNARY_OP_HARDSWISH,
|
||||
GGML_UNARY_OP_HARDSIGMOID,
|
||||
GGML_UNARY_OP_EXP,
|
||||
GGML_UNARY_OP_RELU,
|
||||
GGML_UNARY_OP_GELU_ERF,
|
||||
|
||||
GGML_UNARY_OP_COUNT,
|
||||
};
|
||||
@@ -935,6 +935,15 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// repeat a to the specified shape
|
||||
GGML_API struct ggml_tensor * ggml_repeat_4d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
// sums repetitions in a into shape of b
|
||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
@@ -1598,6 +1598,9 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
||||
for (int i = 0; i < sched->n_backends; i++) {
|
||||
ggml_backend_synchronize(sched->backends[i]);
|
||||
}
|
||||
// reset the current copy to 0 so that the graphs will be similar during generation
|
||||
// necessary for CUDA graphs
|
||||
sched->cur_copy = 0;
|
||||
}
|
||||
|
||||
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
||||
|
||||
Regular → Executable
Regular → Executable
Regular → Executable
+2
@@ -31,6 +31,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
|
||||
return ACL_FLOAT;
|
||||
case GGML_TYPE_F16:
|
||||
return ACL_FLOAT16;
|
||||
case GGML_TYPE_BF16:
|
||||
return ACL_BF16;
|
||||
case GGML_TYPE_I8:
|
||||
return ACL_INT8;
|
||||
case GGML_TYPE_I16:
|
||||
|
||||
Regular → Executable
Regular → Executable
+463
-6
@@ -66,6 +66,7 @@
|
||||
#include <aclnnop/aclnn_gt_scalar.h>
|
||||
#include <aclnnop/aclnn_pow.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v2.h>
|
||||
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -74,11 +75,13 @@
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
|
||||
#include "../ggml-common.h"
|
||||
|
||||
|
||||
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
|
||||
aclTensor ** acl_src1, aclTensor ** acl_dst) {
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
|
||||
@@ -2697,14 +2700,10 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
||||
}
|
||||
}
|
||||
|
||||
// GroupedMatmulV2 required tensor_list.size < 128
|
||||
size_t GROUP_SIZE = 128;
|
||||
std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
|
||||
std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
|
||||
std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;
|
||||
|
||||
// split and call GroupedMatmulV2
|
||||
// GroupedMatmulV2 required tensor_list.size < 128
|
||||
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
||||
// split and call GroupedMatmulV2
|
||||
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
||||
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
||||
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
||||
@@ -2722,6 +2721,133 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs expert-specific matrix multiplication (MoE) with
|
||||
* quantized precision using the CANN backend.
|
||||
*
|
||||
* This function executes a matrix multiplication operation tailored for
|
||||
* Mixture of Experts (MoE) models, where the input tensor is multiplied
|
||||
* with expert-specific quantized weight matrices. It leverages the CANN
|
||||
* backend to perform efficient low-precision computations and stores the
|
||||
* quantized result in the destination tensor `dst`.
|
||||
*
|
||||
* Quantization techniques reduce memory footprint and improve performance
|
||||
* by using lower-bit representations (e.g., int8) instead of floating-point.
|
||||
* This function is designed to work with such formats and may incorporate
|
||||
* optimizations like identity-based fast paths or routing masks for sparse
|
||||
* expert selection.
|
||||
*
|
||||
* @param ctx The context for executing CANN backend operations.
|
||||
* @param dst The destination tensor where the quantized MoE multiplication result
|
||||
* will be stored.
|
||||
*
|
||||
* @note This function assumes quantized data types and is designed for
|
||||
* MoE architectures with potential sparse expert routing.
|
||||
*/
|
||||
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: Use aclnnGroupedMatMul
|
||||
//dst [M, K, N, 1]
|
||||
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
|
||||
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
|
||||
ggml_tensor * ids = dst->src[2]; //ids [K, N]
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// copy index from npu to cpu
|
||||
int64_t n_as = ne02; // A
|
||||
int64_t n_ids = ids->ne[0]; // K
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
|
||||
ACL_MEMCPY_DEVICE_TO_HOST);
|
||||
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
|
||||
|
||||
char * src0_original = (char *) src0->data;
|
||||
char * src1_original = (char *) src1->data;
|
||||
char * dst_original = (char *) dst->data;
|
||||
|
||||
ggml_tensor src0_row = *src0;
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
const enum ggml_type type = dst->src[0]->type;
|
||||
float weight_elem_size;
|
||||
if (type == GGML_TYPE_Q4_0) {
|
||||
weight_elem_size = float(sizeof(uint8_t)) / 2;
|
||||
} else if (type == GGML_TYPE_Q8_0) {
|
||||
weight_elem_size = float(sizeof(uint8_t));
|
||||
} else {
|
||||
GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
|
||||
}
|
||||
|
||||
// src0_row [D, M, 1, 1] weight without permute
|
||||
src0_row.ne[2] = 1;
|
||||
src0_row.ne[3] = 1;
|
||||
src0_row.nb[0] = weight_elem_size;
|
||||
src0_row.nb[1] = weight_elem_size * ne00;
|
||||
src0_row.nb[2] = weight_elem_size * ne00;
|
||||
src0_row.nb[3] = weight_elem_size * ne00;
|
||||
size_t weight_stride = ne00 * ne01 * weight_elem_size;
|
||||
size_t weight_size = weight_stride * ne02 * ne03;
|
||||
|
||||
// scale [D, M, 1, 1] -> scale && permute
|
||||
size_t scale_elem_size = sizeof(uint16_t);
|
||||
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
||||
|
||||
// src1_row [D, 1, 1, 1] -> input
|
||||
src1_row.ne[1] = 1;
|
||||
src1_row.ne[2] = 1;
|
||||
src1_row.ne[3] = 1;
|
||||
src1_row.nb[2] = nb11;
|
||||
src1_row.nb[3] = nb11;
|
||||
|
||||
// dst_row [M, 1, 1, 1] -> out
|
||||
dst_row.ne[1] = 1;
|
||||
dst_row.ne[2] = 1;
|
||||
dst_row.ne[3] = 1;
|
||||
dst_row.nb[2] = nb1;
|
||||
dst_row.nb[3] = nb1;
|
||||
|
||||
//create weight for one row
|
||||
ggml_cann_pool_alloc weight_allocator(ctx.pool());
|
||||
void* weight_buffer = weight_allocator.alloc(nb02);
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
// expert index
|
||||
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
|
||||
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
||||
int64_t i11 = (ne11 == 1 ? 0 : id);
|
||||
int64_t i12 = iid1;
|
||||
|
||||
int64_t i1 = id;
|
||||
int64_t i2 = i12;
|
||||
|
||||
void* src0_tmp_ptr = src0_original + i02*weight_stride;
|
||||
void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
|
||||
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
||||
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
// mem cpy
|
||||
ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
void* scale_buffer = (char*)weight_buffer + weight_stride;
|
||||
ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
|
||||
src0_row.data = weight_buffer;
|
||||
src1_row.data = src1_tmp_ptr;
|
||||
dst_row.data = dst_tmp_ptr;
|
||||
dst_row.src[0] = &src0_row;
|
||||
dst_row.src[1] = &src1_row;
|
||||
|
||||
ggml_cann_mul_mat(ctx, &dst_row);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
const enum ggml_type type = dst->src[0]->type;
|
||||
switch (type) {
|
||||
@@ -2729,8 +2855,339 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
case GGML_TYPE_F16:
|
||||
ggml_cann_mul_mat_id_fp(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
ggml_cann_mul_mat_id_quant(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported type for mul_mat_id");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
|
||||
ggml_tensor* src0 = dst->src[0]; // q, fp32
|
||||
ggml_tensor* src1 = dst->src[1]; // k, fp16
|
||||
ggml_tensor* src2 = dst->src[2]; // v, fp16
|
||||
ggml_tensor* src3 = dst->src[3]; // mask, fp16
|
||||
|
||||
float maxBias = 0.0f;
|
||||
float scaleValue = 1.0f;
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&scaleValue, (float*)dst->op_params + 0, sizeof(float));
|
||||
memcpy(&maxBias, (float*)dst->op_params + 1, sizeof(float));
|
||||
memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float));
|
||||
|
||||
if(logitSoftcap == 0.0f){
|
||||
size_t faElemSize = sizeof(uint16_t);
|
||||
auto faDataType = ACL_FLOAT16; //ACL_BF16;
|
||||
|
||||
aclTensor* acl_src0_f16_tensor = nullptr;
|
||||
aclTensor* acl_src1_f16_tensor = nullptr;
|
||||
aclTensor* acl_src2_f16_tensor = nullptr;
|
||||
aclTensor* acl_dst_f16_tensor = nullptr;
|
||||
|
||||
// Step 1: cast the src0 (Query) to fp16 if needed
|
||||
ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
|
||||
void* src0_f16_buffer = nullptr;
|
||||
|
||||
if(ggml_cann_type_mapping(src0->type) != faDataType){
|
||||
aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0);
|
||||
src0_f16_buffer = src0_f16_allocator.alloc(
|
||||
ggml_nelements(src0) * faElemSize);
|
||||
|
||||
int64_t* src0_f16_ne = src0->ne;
|
||||
size_t src0_f16_nb[GGML_MAX_DIMS];
|
||||
src0_f16_nb[0] = sizeof(uint16_t);
|
||||
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
||||
src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
|
||||
}
|
||||
|
||||
acl_src0_f16_tensor = ggml_cann_create_tensor(
|
||||
src0_f16_buffer, faDataType, faElemSize,
|
||||
src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
|
||||
);
|
||||
aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType);
|
||||
ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
|
||||
}else{
|
||||
acl_src0_f16_tensor = ggml_cann_create_tensor(src0);
|
||||
}
|
||||
|
||||
// Step 2: create the acl tensors for src1 (Key), src2 (Value),
|
||||
// and the direct output from FusedInferAttention
|
||||
|
||||
acl_src1_f16_tensor = ggml_cann_create_tensor(src1);
|
||||
acl_src2_f16_tensor = ggml_cann_create_tensor(src2);
|
||||
|
||||
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
|
||||
void* out_f16_buffer = out_f16_allocator.alloc(
|
||||
ggml_nelements(dst) * faElemSize);
|
||||
|
||||
int64_t* out_f16_ne = src0->ne;
|
||||
size_t out_f16_nb[GGML_MAX_DIMS];
|
||||
out_f16_nb[0] = faElemSize;
|
||||
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
||||
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
|
||||
}
|
||||
|
||||
acl_dst_f16_tensor = ggml_cann_create_tensor(
|
||||
out_f16_buffer, faDataType, faElemSize,
|
||||
out_f16_ne, out_f16_nb, GGML_MAX_DIMS
|
||||
);
|
||||
|
||||
// Step 3: create the PSEShift tensor if needed
|
||||
// this tensor is considered as mask (f16) in the llama.cpp
|
||||
|
||||
aclTensor* bcast_pse_tensor = nullptr;
|
||||
int64_t bcast_pse_ne[GGML_MAX_DIMS];
|
||||
size_t bcast_pse_nb[GGML_MAX_DIMS];
|
||||
ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
|
||||
void* bcast_pse_buffer = nullptr;
|
||||
|
||||
if(src3 != nullptr){
|
||||
bcast_pse_buffer = bcast_pse_allocator.alloc(
|
||||
ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
|
||||
|
||||
if(src0->ne[1] > 1){
|
||||
// Case 1: broadcast pse for prefill stage with multiple head
|
||||
aclTensor* acl_mask_f16_tensor = ggml_cann_create_tensor(src3);
|
||||
bcast_pse_ne[0] = src3->ne[0];
|
||||
bcast_pse_ne[1] = src3->ne[1];
|
||||
bcast_pse_ne[2] = src0->ne[2];
|
||||
bcast_pse_ne[3] = src3->ne[3];
|
||||
|
||||
bcast_pse_nb[0] = sizeof(uint16_t);
|
||||
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
||||
bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
|
||||
}
|
||||
|
||||
bcast_pse_tensor = ggml_cann_create_tensor(
|
||||
bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
|
||||
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
|
||||
|
||||
int64_t repeats[] = {1, src0->ne[2], 1, 1};
|
||||
aclnn_repeat(ctx, acl_mask_f16_tensor, bcast_pse_tensor, repeats);
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_mask_f16_tensor);
|
||||
}else{
|
||||
// Case 2: trunc the first row and broadcast pse for decode stage with multiple head
|
||||
int64_t trunc_pse_ne[GGML_MAX_DIMS] = {src3->ne[0], src0->ne[1], src3->ne[2], src3->ne[3]};
|
||||
size_t* trunc_pse_nb = src3->nb;
|
||||
|
||||
aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
|
||||
src3->data, ACL_FLOAT16, sizeof(uint16_t),
|
||||
trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
|
||||
|
||||
bcast_pse_ne[0] = src3->ne[0];
|
||||
bcast_pse_ne[1] = src0->ne[1];
|
||||
bcast_pse_ne[2] = src0->ne[2];
|
||||
bcast_pse_ne[3] = src3->ne[3];
|
||||
|
||||
bcast_pse_nb[0] = sizeof(uint16_t);
|
||||
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
||||
bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
|
||||
}
|
||||
|
||||
bcast_pse_tensor = ggml_cann_create_tensor(
|
||||
bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
|
||||
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
|
||||
|
||||
int64_t repeats[] = {1, src0->ne[2], 1, 1};
|
||||
aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats);
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor);
|
||||
}
|
||||
|
||||
// Compute the slope if needed. Derived from ggml_cann_softmax().
|
||||
if(maxBias != 0.0f){
|
||||
// alibi
|
||||
const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
|
||||
const int64_t n_head = src0->ne[2];
|
||||
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
|
||||
float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
|
||||
float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
|
||||
// init arange
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
|
||||
ne2_ne3 * faElemSize);
|
||||
void* tmp_arange_buffer = arange_allocator.get();
|
||||
|
||||
// arange1: [1, ..., n_heads_log2_floor+1)
|
||||
float start = 1;
|
||||
float stop = n_heads_log2_floor + 1;
|
||||
float step = 1;
|
||||
int64_t n_elements_arange = n_heads_log2_floor;
|
||||
|
||||
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_arange1_nb[] = {faElemSize};
|
||||
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, faDataType, faElemSize,
|
||||
tmp_arange1_ne, tmp_arange1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
|
||||
|
||||
aclTensor* tmp_arange2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
|
||||
start = 1;
|
||||
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
|
||||
step = 2;
|
||||
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
|
||||
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_arange2_nb[] = {faElemSize};
|
||||
|
||||
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_arange_buffer +
|
||||
n_heads_log2_floor * faElemSize,
|
||||
faDataType, faElemSize,
|
||||
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
|
||||
n_elements_arange);
|
||||
}
|
||||
|
||||
// init mk_base
|
||||
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
|
||||
ne2_ne3 * faElemSize);
|
||||
void* tmp_mk_base_buffer = mk_base_allocator.get();
|
||||
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_mk_base1_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base1_ne, tmp_mk_base1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
|
||||
|
||||
aclTensor* tmp_mk_base2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_mk_base2_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_mk_base_buffer +
|
||||
n_heads_log2_floor * faElemSize,
|
||||
faDataType, faElemSize,
|
||||
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
|
||||
}
|
||||
|
||||
// init mk
|
||||
int64_t tmp_mk_base_ne[] = {ne2_ne3};
|
||||
size_t tmp_mk_base_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
|
||||
|
||||
// reshape mk
|
||||
int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
|
||||
size_t tmp_mk_nb[GGML_MAX_DIMS];
|
||||
tmp_mk_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
|
||||
}
|
||||
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
|
||||
|
||||
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
|
||||
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
|
||||
tmp_arange_tensor, tmp_mk_tensor);
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: set the inputs for FusedInferAttention.
|
||||
int kvTensorNum = 1;
|
||||
aclTensor* acl_q_tensor = acl_src0_f16_tensor;
|
||||
aclTensor* acl_k_tensors[] = {acl_src1_f16_tensor};
|
||||
aclTensor* acl_v_tensors[] = {acl_src2_f16_tensor};
|
||||
auto acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
|
||||
auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
|
||||
|
||||
int64_t numHeads = src0->ne[2]; // N
|
||||
int64_t numKeyValueHeads = src1->ne[2];
|
||||
// double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
|
||||
int64_t preTokens = 65535;
|
||||
int64_t nextTokens = 65535;
|
||||
char layout[5] = {'B', 'N', 'S', 'D', 0};
|
||||
int64_t sparseMode = 0;
|
||||
int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
|
||||
int64_t blockSize = 0;
|
||||
int64_t antiquantMode = 0;
|
||||
bool softmaxLseFlag = false;
|
||||
int64_t keyAntiquantMode = 0;
|
||||
int64_t valueAntiquantMode = 0;
|
||||
|
||||
// Step 5: launch the FusedInferAttentionScoreV2 kernel.
|
||||
// Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
|
||||
acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
|
||||
bcast_pse_tensor, nullptr, // pse, mask
|
||||
nullptr, nullptr, // actSeqLen, actSeqLenkv
|
||||
nullptr, nullptr, // deqScale1, quantScale1
|
||||
nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
|
||||
nullptr, nullptr, // antiquantScale, antiquantOffset
|
||||
nullptr, // blockTable
|
||||
nullptr, nullptr, // qPadSize, kvPadSize
|
||||
nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
|
||||
nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
|
||||
nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
|
||||
numHeads, scaleValue, // heads, scaleValue
|
||||
preTokens, nextTokens, // preTokens, nextTokens
|
||||
layout, // inputLayout
|
||||
numKeyValueHeads, // numKVHeads
|
||||
sparseMode, innerPrecise, // sparseMode, innerPrecise
|
||||
blockSize, antiquantMode, // blockSize, antiquantMode
|
||||
softmaxLseFlag, // softmaxLseFlag
|
||||
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
|
||||
acl_dst_f16_tensor, // attentionOut
|
||||
nullptr // softmaxLse
|
||||
);
|
||||
|
||||
// Step 6: post-processing, permute and cast to f32
|
||||
|
||||
int64_t new_dim[] = {0, 2, 1, 3};
|
||||
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
|
||||
if(ggml_cann_type_mapping(dst->type) != faDataType){
|
||||
ggml_cann_pool_alloc perm_out_f16_allocator(ctx.pool());
|
||||
perm_out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
|
||||
void* perm_out_f16_buffer = perm_out_f16_allocator.get();
|
||||
|
||||
int64_t* perm_out_f16_ne = dst->ne;
|
||||
size_t perm_out_f16_nb[GGML_MAX_DIMS];
|
||||
perm_out_f16_nb[0] = faElemSize;
|
||||
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
||||
perm_out_f16_nb[i] = perm_out_f16_nb[i - 1] * perm_out_f16_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_perm_out_f16_tensor = ggml_cann_create_tensor(
|
||||
perm_out_f16_buffer, faDataType, faElemSize,
|
||||
perm_out_f16_ne, perm_out_f16_nb, GGML_MAX_DIMS);
|
||||
aclnn_permute(ctx, acl_dst_f16_tensor, acl_perm_out_f16_tensor, new_dim, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx,
|
||||
acl_perm_out_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
|
||||
ggml_cann_release_resources(ctx, acl_perm_out_f16_tensor);
|
||||
}else{
|
||||
// only need to permute
|
||||
aclnn_permute(ctx, acl_dst_f16_tensor, acl_dst_tensor, new_dim, GGML_MAX_DIMS);
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
|
||||
acl_src1_f16_tensor,
|
||||
acl_src2_f16_tensor,
|
||||
acl_dst_f16_tensor,
|
||||
acl_dst_tensor);
|
||||
if(src3 != nullptr){
|
||||
ggml_cann_release_resources(ctx, bcast_pse_tensor);
|
||||
}
|
||||
}else{
|
||||
GGML_ABORT("Function is not implemented.");
|
||||
}
|
||||
}
|
||||
|
||||
Regular → Executable
+15
@@ -714,6 +714,21 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
*/
|
||||
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Performs the Flash Attention extended operator using the CANN backend.
|
||||
*
|
||||
* @details This function implements the memory-efficient Flash Attention algorithm
|
||||
* for computing scaled dot-product attention with hardware acceleration.
|
||||
* The result is stored in the destination tensor `dst`.
|
||||
*
|
||||
* This operation is accelerated using the CANN backend to improve runtime performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
|
||||
*/
|
||||
void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/*
|
||||
* @brief A generic wrapper for ACL resources with custom deleter support.
|
||||
*/
|
||||
|
||||
Regular → Executable
Regular → Executable
+45
@@ -36,6 +36,7 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cann/aclnn_ops.h"
|
||||
#include "ggml-cann/common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
|
||||
@@ -1748,6 +1749,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
ggml_cann_count_equal(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
ggml_cann_flash_attn_ext(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2035,6 +2039,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
#ifdef ASCEND_310P
|
||||
// Q4 && Q8 per group is not suppor on 310p device
|
||||
return false;
|
||||
#endif
|
||||
// only support contiguous for quantized types.
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2168,6 +2181,38 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:{
|
||||
// derived from [ggml-cuda.cu]
|
||||
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
|
||||
return false;
|
||||
}
|
||||
if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
|
||||
return false;
|
||||
}
|
||||
if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
|
||||
if(logitSoftcap != 0.0f) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -299,6 +299,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endif()
|
||||
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
||||
message(STATUS "PowerPC detected")
|
||||
if (GGML_NATIVE)
|
||||
@@ -338,8 +357,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
|
||||
message(STATUS "RISC-V detected")
|
||||
if (GGML_RVV)
|
||||
if (GGML_RV_ZFH)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
|
||||
if (GGML_XTHEADVECTOR)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
|
||||
elseif (GGML_RV_ZFH)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
||||
endif()
|
||||
@@ -477,25 +498,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endif()
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||
endif()
|
||||
|
||||
@@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
}
|
||||
}
|
||||
return;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
@@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
|
||||
@@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#else
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
|
||||
@@ -883,7 +883,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
||||
#endif
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
|
||||
size_t vl = QK8_0;
|
||||
|
||||
@@ -1221,7 +1221,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
||||
#endif
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
|
||||
size_t vl = QK8_1;
|
||||
|
||||
@@ -2384,7 +2384,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl = qk / 2;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -2774,7 +2774,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl = qk / 2;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -3121,7 +3121,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl;
|
||||
size_t vlenb = __riscv_vlenb();
|
||||
|
||||
@@ -3460,7 +3460,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl;
|
||||
size_t vlenb = __riscv_vlenb();
|
||||
|
||||
@@ -3897,7 +3897,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(accum);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl = qk;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -5100,14 +5100,111 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
float sumf = 0;
|
||||
uint8_t atmp[16];
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
uint8_t *patmp = atmp;
|
||||
int vsums;
|
||||
int tmp;
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl16], e8, m1\n\t"
|
||||
"th.vmv.v.x v8, zero\n\t"
|
||||
"th.vlb.v v1, (%[sc])\n\t"
|
||||
"th.vand.vi v0, v1, 0xF\n\t"
|
||||
"th.vsrl.vi v1, v1, 4\n\t"
|
||||
"th.vsb.v v0, (%[scale])\n\t"
|
||||
"th.vwaddu.vx v16, v1, zero\n\t"
|
||||
"th.vsetvli zero, %[vl16], e16, m2\n\t"
|
||||
"th.vlh.v v2, (%[bsums])\n\t"
|
||||
"th.vwmul.vv v4, v16, v2\n\t"
|
||||
"th.vsetvli zero, %[vl16], e32, m4\n\t"
|
||||
"th.vredsum.vs v8, v4, v8\n\t"
|
||||
"th.vmv.x.s %[vsums], v8"
|
||||
: [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
|
||||
: [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
|
||||
, [vl16] "r" (16)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
sumf += dmin * vsums;
|
||||
int isum = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"th.vlb.v v0, (%[q2])\n\t"
|
||||
"th.vsrl.vi v2, v0, 2\n\t"
|
||||
"th.vsrl.vi v4, v0, 4\n\t"
|
||||
"th.vsrl.vi v6, v0, 6\n\t"
|
||||
"th.vand.vi v0, v0, 0x3\n\t"
|
||||
"th.vand.vi v2, v2, 0x3\n\t"
|
||||
"th.vand.vi v4, v4, 0x3\n\t"
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"th.vlb.v v8, (%[q8])\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"th.vwmul.vv v24, v4, v12\n\t"
|
||||
"th.vsetvli zero, %[vl16], e16, m2\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vwredsum.vs v10, v16, v0\n\t"
|
||||
"th.vwredsum.vs v9, v18, v0\n\t"
|
||||
"th.vwredsum.vs v8, v20, v0\n\t"
|
||||
"th.vwredsum.vs v7, v22, v0\n\t"
|
||||
"th.vwredsum.vs v11, v24, v0\n\t"
|
||||
"th.vwredsum.vs v12, v26, v0\n\t"
|
||||
"th.vwredsum.vs v13, v28, v0\n\t"
|
||||
"th.vwredsum.vs v14, v30, v0\n\t"
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vslideup.vi v10, v9, 1\n\t"
|
||||
"th.vslideup.vi v8, v7, 1\n\t"
|
||||
"th.vslideup.vi v11, v12, 1\n\t"
|
||||
"th.vslideup.vi v13, v14, 1\n\t"
|
||||
"th.vslideup.vi v10, v8, 2\n\t"
|
||||
"th.vslideup.vi v11, v13, 2\n\t"
|
||||
"li %[tmp], 8\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m2\n\t"
|
||||
"th.vlbu.v v12, (%[scale])\n\t"
|
||||
"th.vmul.vv v10, v10, v12\n\t"
|
||||
"th.vredsum.vs v0, v10, v0\n\t"
|
||||
"th.vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[isum], %[isum], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
|
||||
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
|
||||
, [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
q2 += 32; q8 += 128; patmp += 8;
|
||||
}
|
||||
|
||||
sumf += dall * isum;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
float sumf = 0;
|
||||
uint8_t atmp[16];
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
uint8_t atmp[16];
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
@@ -6137,14 +6234,141 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
uint32_t aux[3];
|
||||
uint32_t utmp[4];
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].hmask;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
int8_t * scale = (int8_t *)utmp;
|
||||
int tmp;
|
||||
__asm__ __volatile__(
|
||||
"li %[tmp], 12\n\t"
|
||||
"th.vsetvli zero, %[tmp], e8, m1\n\t"
|
||||
"th.vlb.v v0, (%[s6b])\n\t"
|
||||
"th.vmv.v.v v2, v0\n\t"
|
||||
"li %[tmp], 2\n\t"
|
||||
"th.vsetvli zero, %[tmp], e64, m1\n\t"
|
||||
"th.vmv.v.x v9, %[sh]\n\t"\
|
||||
"th.vslidedown.vi v1, v0, 1\n\t"
|
||||
"th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
|
||||
"th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vid.v v9\n\t"
|
||||
"th.vmv.x.s %[tmp], v1\n\t"
|
||||
"th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
|
||||
"th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
|
||||
"th.vsrl.vv v4, v1, v9\n\t"
|
||||
"th.vsrl.vv v2, v0, v8\n\t"
|
||||
"th.vand.vx v5, v4, %[kmask1]\n\t"
|
||||
"th.vand.vx v3, v2, %[kmask2]\n\t"
|
||||
"th.vsll.vi v6, v5, 4\n\t"
|
||||
"th.vor.vv v7, v6, v3\n\t"
|
||||
"li %[tmp], 16\n\t"
|
||||
"th.vsetvli zero, %[tmp], e8, m1\n\t"
|
||||
"th.vsub.vx v0, v7, %[c]\n\t"
|
||||
"th.vsb.v v0, (%[scale])"
|
||||
: [tmp] "=&r" (tmp)
|
||||
: [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
|
||||
, [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
|
||||
uint8_t m = 1;
|
||||
int isum = 0;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
__asm__ __volatile__(
|
||||
// fixme: use v0p7 mask layout directly
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"th.vlb.v v8, (%[q3])\n\t"
|
||||
"th.vsrl.vi v10, v8, 2\n\t"
|
||||
"th.vsrl.vi v12, v8, 4\n\t"
|
||||
"th.vsrl.vi v14, v8, 6\n\t"
|
||||
"th.vand.vi v8, v8, 3\n\t"
|
||||
"th.vand.vi v10, v10, 3\n\t"
|
||||
"th.vand.vi v12, v12, 3\n\t"
|
||||
"th.vlb.v v2, (%[qh])\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v8, v8, -4, v0.t\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v10, v10, -4, v0.t\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v12, v12, -4, v0.t\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v14, v14, -4, v0.t\n\t"
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"th.vlb.v v0, (%[q8])\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"th.vwmul.vv v24, v4, v12\n\t"
|
||||
"li %[tmp], 16\n\t"
|
||||
"th.vsetvli zero, %[tmp], e16, m2\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vwredsum.vs v10, v16, v0\n\t"
|
||||
"th.vwredsum.vs v9, v18, v0\n\t"
|
||||
"th.vwredsum.vs v8, v20, v0\n\t"
|
||||
"th.vwredsum.vs v7, v22, v0\n\t"
|
||||
"th.vwredsum.vs v11, v24, v0\n\t"
|
||||
"th.vwredsum.vs v12, v26, v0\n\t"
|
||||
"th.vwredsum.vs v13, v28, v0\n\t"
|
||||
"th.vwredsum.vs v14, v30, v0\n\t"
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vslideup.vi v10, v9, 1\n\t"
|
||||
"th.vslideup.vi v8, v7, 1\n\t"
|
||||
"th.vslideup.vi v11, v12, 1\n\t"
|
||||
"th.vslideup.vi v13, v14, 1\n\t"
|
||||
"th.vslideup.vi v10, v8, 2\n\t"
|
||||
"th.vslideup.vi v11, v13, 2\n\t"
|
||||
"li %[tmp], 8\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m2\n\t"
|
||||
"th.vlb.v v12, (%[scale])\n\t"
|
||||
"th.vmul.vv v10, v10, v12\n\t"
|
||||
"th.vredsum.vs v0, v10, v0\n\t"
|
||||
"th.vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[isum], %[isum], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
|
||||
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
|
||||
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
q3 += 32; q8 += 128; scale += 8;
|
||||
}
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
sumf += d * isum;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
uint32_t utmp[4];
|
||||
float sumf = 0;
|
||||
uint32_t aux[3];
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
@@ -6331,7 +6555,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
"vslideup.vi v13, v14, 1\n\t"
|
||||
"vslideup.vi v10, v8, 2\n\t"
|
||||
"vslideup.vi v11, v13, 2\n\t"
|
||||
"vsetivli zero, 8, e32, m2\n\t"\
|
||||
"vsetivli zero, 8, e32, m2\n\t"
|
||||
"vle8.v v15, (%[scale])\n\t"
|
||||
"vsext.vf4 v12, v15\n\t"
|
||||
"vmul.vv v10, v10, v12\n\t"
|
||||
@@ -7180,14 +7404,130 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int tmp, tmp2, sumi;
|
||||
__asm__ __volatile__(
|
||||
"li %[t1], 12\n\t"
|
||||
"th.vsetvli zero, %[t1], e8, m1\n\t"
|
||||
"th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
|
||||
"li %[t1], 4\n\t"
|
||||
"th.vsetvli zero, %[t1], e32, m1\n\t"
|
||||
"th.vslidedown.vi v2, v1, 2\n\t"
|
||||
"th.vmv.v.v v3, v2\n\t"
|
||||
"th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
|
||||
"li %[t1], 2\n\t"
|
||||
"th.vsetvli zero, %[t1], e32, m1\n\t"
|
||||
"th.vmv.v.i v4, 4\n\t"
|
||||
"th.vand.vx v8, v1, %[kmask1]\n\t"
|
||||
"th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
|
||||
"th.vsrl.vi v6, v1, 6\n\t"
|
||||
"th.vsrl.vv v7, v2, v5\n\t"
|
||||
"th.vand.vx v0, v6, %[kmask3]\n\t"
|
||||
"th.vand.vx v2, v7, %[kmask2]\n\t"
|
||||
"th.vsll.vi v6, v0, 4\n\t"
|
||||
"li %[t2], 8\n\t"
|
||||
"addi %[t1], %[utmp], 4\n\t"
|
||||
"th.vor.vv v1, v6, v2\n\t"
|
||||
"th.vssw.v v8, (%[utmp]), %[t2]\n\t"
|
||||
"th.vssw.v v1, (%[t1]), %[t2]\n\t"
|
||||
"th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
|
||||
"th.vlw.v v2, (%[bsums])\n\t"
|
||||
"th.vsetvli zero, %[t2], e16, m1\n\t"
|
||||
"th.vnsrl.vi v0, v2, 0\n\t"
|
||||
"th.vnsrl.vi v1, v2, 16\n\t"
|
||||
"th.vadd.vv v2, v0, v1\n\t"
|
||||
"th.vlbu.v v4, (%[mins])\n\t"
|
||||
"th.vwmul.vv v6, v4, v2\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vsetvli zero, %[t2], e32, m2\n\t"
|
||||
"th.vredsum.vs v0, v6, v0\n\t"
|
||||
"th.vmv.x.s %[sumi], v0"
|
||||
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
|
||||
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
|
||||
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
|
||||
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
sumf -= dmin * sumi;
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
sumi = 0;
|
||||
const uint8_t * scale = scales;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
int vl128 = 128, vl64 = 64, vl32 = 32;
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"th.vlb.v v8, (%[q8])\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"th.vlb.v v0, (%[q4])\n\t"
|
||||
"th.vsrl.vi v4, v0, 4\n\t"
|
||||
"th.vand.vi v0, v0, 0xF\n\t"
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"th.vwmul.vv v28, v6, v14\n\t"
|
||||
"th.vwmul.vv v20, v4, v10\n\t"
|
||||
"th.vwmul.vv v24, v2, v12\n\t"
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vlbu.v v1, (%[scale])\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vsetvli zero, %[vl32], e16, m4\n\t"
|
||||
"th.vwredsum.vs v6, v24, v0\n\t"
|
||||
"th.vwredsum.vs v7, v28, v0\n\t"
|
||||
"th.vwredsum.vs v4, v16, v0\n\t"
|
||||
"th.vwredsum.vs v5, v20, v0\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vslideup.vi v6, v7, 1\n\t"
|
||||
"th.vslideup.vi v4, v5, 1\n\t"
|
||||
"th.vslideup.vi v4, v6, 2\n\t"
|
||||
"th.vmul.vv v8, v4, v1\n\t"
|
||||
"th.vredsum.vs v0, v8, v0\n\t"
|
||||
"th.vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[sumi], %[sumi], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
|
||||
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
|
||||
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
|
||||
q4 += 64; q8 += 128; scale += 4;
|
||||
}
|
||||
|
||||
sumf += d * sumi;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
float sumf = 0;
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
@@ -8074,7 +8414,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_v
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
@@ -9232,11 +9572,92 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * restrict q6 = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const int8_t * restrict scale = x[i].scales;
|
||||
|
||||
int sum_t = 0;
|
||||
int t0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
|
||||
"th.vlb.v v4, (%[qh])\n\t"
|
||||
"th.vsll.vi v0, v4, 4\n\t"
|
||||
"th.vsll.vi v2, v4, 2\n\t"
|
||||
"th.vsrl.vi v6, v4, 2\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
|
||||
"th.vlb.v v8, (%[q6])\n\t"
|
||||
"th.vsrl.vi v12, v8, 4\n\t"
|
||||
"th.vand.vi v8, v8, 0xF\n\t"
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
|
||||
"th.vand.vx v0, v0, %[mask]\n\t"
|
||||
"th.vor.vv v8, v8, v0\n\t"
|
||||
"th.vlb.v v0, (%[q8])\n\t"
|
||||
"th.vsub.vx v8, v8, %[vl32]\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"th.vwmul.vv v24, v4, v12\n\t"
|
||||
"li %[t0], 16\n\t"
|
||||
"th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vwredsum.vs v10, v16, v0\n\t"
|
||||
"th.vwredsum.vs v9, v18, v0\n\t"
|
||||
"th.vwredsum.vs v8, v20, v0\n\t"
|
||||
"th.vwredsum.vs v7, v22, v0\n\t"
|
||||
"th.vwredsum.vs v11, v24, v0\n\t"
|
||||
"th.vwredsum.vs v12, v26, v0\n\t"
|
||||
"th.vwredsum.vs v13, v28, v0\n\t"
|
||||
"th.vwredsum.vs v14, v30, v0\n\t"
|
||||
"li %[t0], 4\n\t"
|
||||
"th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
|
||||
"th.vslideup.vi v10, v9, 1\n\t"
|
||||
"th.vslideup.vi v8, v7, 1\n\t"
|
||||
"th.vslideup.vi v11, v12, 1\n\t"
|
||||
"th.vslideup.vi v13, v14, 1\n\t"
|
||||
"th.vslideup.vi v10, v8, 2\n\t"
|
||||
"th.vslideup.vi v11, v13, 2\n\t"
|
||||
"li %[t0], 8\n\t"
|
||||
"th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
|
||||
"th.vlb.v v4, (%[scale])\n\t"
|
||||
"th.vmul.vv v2, v4, v10\n\t"
|
||||
"th.vredsum.vs v0, v2, v0\n\t"
|
||||
"th.vmv.x.s %[t0], v0\n\t"
|
||||
"add %[sumi], %[sumi], %[t0]"
|
||||
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
|
||||
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
|
||||
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
||||
, [mask] "r" (0x30)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
q6 += 64; qh += 32; q8 += 128; scale += 8;
|
||||
}
|
||||
|
||||
sumf += d * sum_t;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
float sumf = 0;
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
@@ -3484,6 +3484,19 @@ void ggml_cpu_init(void) {
|
||||
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
||||
|
||||
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
|
||||
|
||||
#ifdef GGML_USE_OPENMP
|
||||
//if (!getenv("OMP_WAIT_POLICY")) {
|
||||
// // set the wait policy to active, so that OpenMP threads don't sleep
|
||||
// putenv("OMP_WAIT_POLICY=active");
|
||||
//}
|
||||
|
||||
if (!getenv("KMP_BLOCKTIME")) {
|
||||
// set the time to wait before sleeping a thread
|
||||
// this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
|
||||
putenv("KMP_BLOCKTIME=200"); // 200ms
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__ARM_ARCH)
|
||||
|
||||
@@ -168,7 +168,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||
|
||||
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
||||
|
||||
#if !defined(GGML_USE_HIP)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
static const char * cu_get_error_str(CUresult err) {
|
||||
const char * err_str;
|
||||
cuGetErrorString(err, &err_str);
|
||||
|
||||
@@ -212,6 +212,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
}
|
||||
}
|
||||
if (__all_sync(0xFFFFFFFF, skip)) {
|
||||
__syncthreads();
|
||||
continue;
|
||||
}
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
@@ -217,6 +217,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
}
|
||||
}
|
||||
if (__all_sync(0xFFFFFFFF, skip)) {
|
||||
__syncthreads();
|
||||
continue;
|
||||
}
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
@@ -2192,6 +2192,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_SILU:
|
||||
ggml_cuda_op_silu(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
ggml_cuda_op_gelu_erf(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
ggml_cuda_op_gelu_quick(ctx, dst);
|
||||
break;
|
||||
@@ -2977,6 +2980,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
|
||||
@@ -23,6 +23,12 @@ static __device__ __forceinline__ float op_gelu(float x) {
|
||||
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_gelu_erf(float x) {
|
||||
const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
||||
|
||||
return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_gelu_quick(float x) {
|
||||
const float GELU_QUICK_COEF = -1.702f;
|
||||
|
||||
@@ -134,6 +140,10 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_gelu>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
|
||||
}
|
||||
|
||||
@@ -30,6 +30,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -386,7 +386,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
||||
return r;
|
||||
}
|
||||
|
||||
#elif defined(__riscv) && defined(GGML_RV_ZFH)
|
||||
#elif defined(__riscv) && defined(__riscv_zfhmin)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
float f;
|
||||
|
||||
+227
-121
@@ -1,74 +1,93 @@
|
||||
#include "binbcast.hpp"
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "dpct/helper.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
|
||||
dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
|
||||
auto element_id = it.get_global_id(0);
|
||||
auto global_range = it.get_global_range(0);
|
||||
for (; element_id < num_elements; element_id += global_range) {
|
||||
auto src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
|
||||
auto src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
|
||||
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
|
||||
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
|
||||
dst[element_id] = val_to_store;
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) /
|
||||
ne3;
|
||||
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) %
|
||||
ne3;
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
for (int i0 = i0s; i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
|
||||
int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
|
||||
int s11, int s12, int s13, std::size_t num_dst_elements,
|
||||
const sycl::nd_item<1> & item_ct1) {
|
||||
auto calculate_logical_index =
|
||||
[](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
|
||||
std::array<int, 4> logical_index;
|
||||
#pragma unroll(4)
|
||||
for (int i = 3; i >= 0; i--) {
|
||||
logical_index[i] = element_id % dims[i];
|
||||
element_id /= dims[i];
|
||||
}
|
||||
return logical_index;
|
||||
};
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
|
||||
const std::array<int, 4> & indices) __attribute__((always_inline))
|
||||
->std::size_t {
|
||||
std::size_t index = 0;
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < 4; i++) {
|
||||
auto index_i = indices[i];
|
||||
if (indices[i] >= dims[i]) {
|
||||
index_i = indices[i] % dims[i];
|
||||
}
|
||||
index += strides[i] * index_i;
|
||||
}
|
||||
return index;
|
||||
};
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
auto element_id = item_ct1.get_global_id(0);
|
||||
for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
|
||||
auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
|
||||
auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
|
||||
auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
|
||||
auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
|
||||
auto src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
|
||||
auto src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
|
||||
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
|
||||
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
|
||||
dst[dst_index] = val_to_store;
|
||||
const int i3 = i/(ne2*ne1*ne0);
|
||||
const int i2 = (i/(ne1*ne0)) % ne2;
|
||||
const int i1 = (i/ne0) % ne1;
|
||||
const int i0 = i % ne0;
|
||||
|
||||
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
|
||||
|
||||
template<float (*bin_op)(const float, const float)>
|
||||
struct bin_bcast_sycl {
|
||||
template <typename src0_t, typename src1_t, typename dst_t>
|
||||
void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
|
||||
const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
|
||||
@@ -77,73 +96,165 @@ template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
|
||||
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
|
||||
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
|
||||
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
|
||||
auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
|
||||
const std::array<int64_t, 4> & dst_dims) -> bool {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (dst_dims[i] > src_dims[i]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
int nr0 = ne10 / ne0;
|
||||
int nr1 = ne11/ne1;
|
||||
int nr2 = ne12/ne2;
|
||||
int nr3 = ne13/ne3;
|
||||
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
// collapse dimensions until first broadcast dimension
|
||||
int64_t cne[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne0[] = {ne00, ne01, ne02, ne03};
|
||||
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
||||
size_t cnb[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb0[] = {nb00, nb01, nb02, nb03};
|
||||
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
cne[1] = cne[2];
|
||||
cne[2] = cne[3];
|
||||
cne[3] = 1;
|
||||
};
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
||||
cnb[1] *= cne[1];
|
||||
cnb[2] *= cne[2];
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb, cne);
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
int64_t ne0 = cne[0];
|
||||
int64_t ne1 = cne[1];
|
||||
int64_t ne2 = cne[2];
|
||||
int64_t ne3 = cne[3];
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
int64_t ne10 = cne1[0];
|
||||
int64_t ne11 = cne1[1];
|
||||
int64_t ne12 = cne1[2];
|
||||
int64_t ne13 = cne1[3];
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
size_t nb0 = cnb[0];
|
||||
size_t nb1 = cnb[1];
|
||||
size_t nb2 = cnb[2];
|
||||
size_t nb3 = cnb[3];
|
||||
|
||||
// dst strides in number of elements
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
size_t nb00 = cnb0[0];
|
||||
size_t nb01 = cnb0[1];
|
||||
size_t nb02 = cnb0[2];
|
||||
size_t nb03 = cnb0[3];
|
||||
|
||||
// src1 strides in number of elements
|
||||
size_t s10 = nb10 / sizeof(src0_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
// src0 strides in number of elements
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
|
||||
std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
|
||||
static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
|
||||
std::size_t local_range = 256;
|
||||
std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;
|
||||
size_t s10 = nb10 / sizeof(src1_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
|
||||
check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
|
||||
bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
if (! needs_broadcasting && all_contiguous) {
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
|
||||
k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
|
||||
s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
|
||||
});
|
||||
});
|
||||
GGML_UNUSED(s00);
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
||||
|
||||
sycl::range<3> block_dims(1, 1, 1);
|
||||
block_dims[2] = std::min<unsigned int>(hne0, block_size);
|
||||
block_dims[1] = std::min<unsigned int>(
|
||||
ne1, block_size / (unsigned int)block_dims[2]);
|
||||
block_dims[0] = std::min(
|
||||
std::min<unsigned int>(
|
||||
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
|
||||
(unsigned int)block_dims[1]),
|
||||
64U);
|
||||
|
||||
sycl::range<3> block_nums(
|
||||
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
|
||||
(ne1 + block_dims[1] - 1) / block_dims[1],
|
||||
(hne0 + block_dims[2] - 1) / block_dims[2]);
|
||||
|
||||
if (block_nums[0] > 65535) {
|
||||
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
||||
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
|
||||
sycl::range<3>(1, 1, block_size),
|
||||
sycl::range<3>(1, 1, block_size)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
||||
s03, s11, s12, s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:16: The work-group size passed to the SYCL kernel may
|
||||
exceed the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if
|
||||
needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -208,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds
|
||||
|
||||
|
||||
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_add(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_sub(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_mul(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_div(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_repeat(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,8 +13,10 @@
|
||||
#ifndef GGML_SYCL_COMMON_HPP
|
||||
#define GGML_SYCL_COMMON_HPP
|
||||
|
||||
#include <cstddef>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "dpct/helper.hpp"
|
||||
#include "ggml-sycl.h"
|
||||
@@ -44,11 +46,20 @@ extern int g_ggml_sycl_debug;
|
||||
extern int g_ggml_sycl_disable_optimize;
|
||||
extern int g_ggml_sycl_prioritize_dmmv;
|
||||
|
||||
#define GGML_SYCL_DEBUG(...) \
|
||||
do { \
|
||||
if (g_ggml_sycl_debug) \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
} while (0)
|
||||
#if defined(__clang__) && __has_builtin(__builtin_expect)
|
||||
// Hint the optimizer to pipeline the more likely following instruction in branches
|
||||
# define LIKELY(expr) __builtin_expect(expr, true)
|
||||
# define UNLIKELY(expr) __builtin_expect(expr, false)
|
||||
#else
|
||||
# define LIKELY(expr) (expr)
|
||||
# define UNLIKELY(expr) (expr)
|
||||
#endif
|
||||
|
||||
#define GGML_SYCL_DEBUG(...) \
|
||||
do { \
|
||||
if (UNLIKELY(g_ggml_sycl_debug)) \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define CHECK_TRY_ERROR(expr) \
|
||||
[&]() { \
|
||||
@@ -471,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
|
||||
return x;
|
||||
}
|
||||
|
||||
/* Helper for Computing the linear offset of a ggml_tensor given
|
||||
per-dimension sizes, strides, and indices */
|
||||
template<int N>
|
||||
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
|
||||
size_t offset = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N; i++) {
|
||||
auto index_i = indices[i];
|
||||
offset += strides[i] * index_i;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Helper for vec loading aligned data
|
||||
template <typename Tp, int n>
|
||||
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
|
||||
@@ -490,4 +514,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
||||
}
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
|
||||
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << prefix << "=[";
|
||||
for (std::size_t i = 0; i < N - 1; ++i) {
|
||||
ss << array[i] << ", ";
|
||||
}
|
||||
if constexpr (N > 0) {
|
||||
ss << array[N - 1];
|
||||
}
|
||||
ss << "]";
|
||||
GGML_SYCL_DEBUG("%s", ss.str().c_str());
|
||||
}
|
||||
|
||||
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
|
||||
const std::string & suffix = "") {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s=", prefix.c_str());
|
||||
if (tensor) {
|
||||
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
|
||||
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
|
||||
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
|
||||
if (!ggml_is_contiguous(tensor)) {
|
||||
GGML_SYCL_DEBUG(";strided");
|
||||
}
|
||||
if (ggml_is_permuted(tensor)) {
|
||||
GGML_SYCL_DEBUG(";permuted");
|
||||
}
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("nullptr");
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s", suffix.c_str());
|
||||
}
|
||||
|
||||
// Use scope_op_debug_print to log operations coming from running a model
|
||||
struct scope_op_debug_print {
|
||||
// Use string_views to avoid the cost of creating a string and concatenating them
|
||||
// string_views must be alive for as long as the object is alive
|
||||
// scope_op_debug_print are used with string literals in practice which are stored in constant space so always accessible
|
||||
scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
|
||||
std::size_t num_src, const std::string_view & suffix = "") :
|
||||
func(func),
|
||||
func_suffix(func_suffix) {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
}
|
||||
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
||||
debug_print_tensor(" dst", dst);
|
||||
if (dst) {
|
||||
for (std::size_t i = 0; i < num_src; ++i) {
|
||||
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
|
||||
}
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
||||
}
|
||||
|
||||
scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
|
||||
const std::string_view & suffix = "") :
|
||||
scope_op_debug_print(func, "", dst, num_src, suffix) {}
|
||||
|
||||
~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
|
||||
|
||||
private:
|
||||
std::string_view func;
|
||||
std::string_view func_suffix;
|
||||
};
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont(
|
||||
}
|
||||
|
||||
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
queue_ptr stream = ctx.stream();
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
queue_ptr stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *)dst->op_params)[0];
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float *src0_d = (const float *)src0->data;
|
||||
const float *src1_d = (const float *)src1->data;
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
|
||||
float *dst_d = (float *)dst->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_sycl(
|
||||
src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
|
||||
src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
|
||||
dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
} else {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
||||
}
|
||||
} else {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
||||
concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
|
||||
src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
||||
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
||||
}
|
||||
} else
|
||||
concat_f32_sycl_non_cont(
|
||||
stream, (const char *)src0->data, (const char *)src1->data,
|
||||
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
|
||||
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
|
||||
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
||||
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
||||
}
|
||||
|
||||
@@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl(
|
||||
}
|
||||
|
||||
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
||||
@@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
|
||||
}
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
||||
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
|
||||
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
|
||||
std::string(" src0 type=") + ggml_type_name(src0->type));
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
||||
|
||||
char * src0_ddc = (char *) src0->data;
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
|
||||
ggml_type_name(src1->type));
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
||||
}
|
||||
|
||||
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
// TODO: why do we pass dst as src1 here?
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_cpy(ctx, dst->src[0], dst);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
|
||||
}
|
||||
|
||||
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
||||
|
||||
if (src1_convert_f16) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
||||
" : converting src1 to fp16");
|
||||
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
||||
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
||||
GGML_ASSERT(to_fp16_sycl != nullptr);
|
||||
|
||||
@@ -84,6 +84,15 @@ static void gelu_quick(const T *x, T *dst, int k,
|
||||
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
||||
const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
|
||||
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
||||
auto x_i = x[i];
|
||||
dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void tanh(const T *x, T *dst, int k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
@@ -400,6 +409,20 @@ static void gelu_quick_sycl(const T *x, T *dst, const int k,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static void gelu_erf_sycl(const T *x, T *dst, const int k,
|
||||
queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
gelu_erf(x, dst, k, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void tanh_sycl(const T *x, T *dst, const int k,
|
||||
queue_ptr stream) {
|
||||
@@ -816,6 +839,38 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
#else
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
#endif
|
||||
GGML_ASSERT(dst->src[0]->type == dst->type);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
switch (dst->type) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
auto data_pts = cast_data<sycl::half>(dst);
|
||||
gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
auto data_pts = cast_data<float>(dst);
|
||||
gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("GGML tensor type not supported!\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
@@ -1391,146 +1446,126 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
||||
|
||||
|
||||
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_sqrt(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_sin(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_cos(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_acc(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_gelu(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_silu(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_gelu_quick(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_gelu_erf(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_tanh(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_relu(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_sigmoid(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_hardsigmoid(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_hardswish(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_exp(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_log(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_neg(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_step(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_leaky_relu(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_sqr(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_upscale(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_pad(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_clamp(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_sgn(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_abs(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_elu(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
@@ -38,6 +38,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -257,8 +257,7 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
GGML_UNUSED(ctx);
|
||||
}
|
||||
|
||||
void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -308,4 +307,3 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -346,6 +346,8 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static enum ggml_status
|
||||
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
@@ -381,7 +383,9 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
|
||||
@@ -407,7 +411,9 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
@@ -435,7 +441,12 @@ static bool
|
||||
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *src,
|
||||
ggml_tensor *dst) try {
|
||||
if (ggml_backend_buffer_is_sycl(src->buffer)) {
|
||||
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": dst=", dst);
|
||||
debug_print_tensor(" src=", src);
|
||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||
if (is_cpy_supported) {
|
||||
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
||||
ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
|
||||
|
||||
@@ -492,7 +503,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
|
||||
uint8_t value) try {
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size);
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
||||
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
queue_ptr stream = ctx->stream;
|
||||
@@ -511,7 +523,9 @@ catch (sycl::exception const &exc) {
|
||||
|
||||
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
||||
size_t offset, size_t size) {
|
||||
GGML_SYCL_DEBUG(" [SYCL] call %s\n", __func__);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
||||
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
|
||||
@@ -789,6 +803,8 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
static enum ggml_status
|
||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
@@ -873,6 +889,9 @@ static void
|
||||
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor, const void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
@@ -926,6 +945,9 @@ static void
|
||||
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
@@ -2015,12 +2037,12 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
#else
|
||||
bool use_fp16 = false;
|
||||
#endif
|
||||
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
|
||||
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
|
||||
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
|
||||
if (src0->type != GGML_TYPE_F16) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
||||
" : converting src0 to fp16");
|
||||
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
|
||||
GGML_ASSERT(to_fp16_sycl != nullptr);
|
||||
size_t ne = row_diff*ne00;
|
||||
@@ -2033,6 +2055,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
||||
" : converting src1 to fp16");
|
||||
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
||||
GGML_ASSERT(to_fp16_sycl != nullptr);
|
||||
size_t ne = src1_ncols*ne10;
|
||||
@@ -2049,6 +2073,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting dst to fp32");
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
||||
}
|
||||
@@ -2064,21 +2090,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
||||
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
||||
dpct::library_data_t::real_half)));
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting dst to fp32");
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
|
||||
} else {
|
||||
ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
|
||||
if (src0->type != GGML_TYPE_F32) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting src0 to fp32");
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
|
||||
GGML_ASSERT(to_fp32_sycl != nullptr);
|
||||
src0_ddq_as_f32.alloc(row_diff*ne00);
|
||||
to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
||||
}
|
||||
if (src1->type != GGML_TYPE_F32) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting src1 to fp32");
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
|
||||
GGML_ASSERT(to_fp32_sycl != nullptr);
|
||||
src1_ddq_as_f32.alloc(src1_ncols*ne10);
|
||||
@@ -2114,8 +2144,7 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
@@ -2167,8 +2196,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
||||
sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
@@ -2199,8 +2227,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
|
||||
argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
||||
|
||||
@@ -2215,8 +2242,7 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *ds
|
||||
argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {
|
||||
|
||||
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
@@ -2233,8 +2259,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tens
|
||||
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
@@ -2421,6 +2446,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
||||
/*
|
||||
DPCT1010:90: SYCL uses exceptions to report errors and does not
|
||||
@@ -2525,6 +2552,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
||||
/*
|
||||
DPCT1010:92: SYCL uses exceptions to report errors and does
|
||||
@@ -2619,33 +2648,28 @@ catch (sycl::exception const &exc) {
|
||||
|
||||
|
||||
static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_get_rows(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_norm(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_rms_norm(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_l2_norm(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_group_norm(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
@@ -2773,6 +2797,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
|
||||
// convert src1 to fp16
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
|
||||
" : converting src1 to fp16");
|
||||
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
||||
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
@@ -3076,6 +3102,7 @@ static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor *
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
|
||||
@@ -3153,7 +3180,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
constexpr bool convert_src1_to_q8_1 = false;
|
||||
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
|
||||
}
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
@@ -3224,6 +3250,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
|
||||
|
||||
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
ggml_tensor *dst) try {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
||||
@@ -3392,37 +3419,45 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_scale(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_diag_mask_inf(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_pool2d(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_im2col(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
||||
ggml_sycl_op_sum(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
||||
ggml_sycl_op_sum_rows(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
||||
ggml_sycl_op_argsort(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
||||
ggml_sycl_op_argmax(ctx, dst);
|
||||
}
|
||||
@@ -3508,6 +3543,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
ggml_sycl_gelu_quick(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
ggml_sycl_gelu_erf(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
ggml_sycl_tanh(ctx, dst);
|
||||
break;
|
||||
@@ -3716,6 +3754,9 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
@@ -3734,6 +3775,9 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
@@ -3752,7 +3796,13 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *src,
|
||||
ggml_tensor *dst) try {
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
|
||||
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
||||
ggml_backend_buffer_is_sycl(src->buffer);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": dst=", dst);
|
||||
debug_print_tensor(" src=", src);
|
||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||
if (is_cpy_supported) {
|
||||
/*
|
||||
DPCT1009:215: SYCL uses exceptions to report errors and does not use the
|
||||
error codes. The original code was commented out and a warning string
|
||||
@@ -3773,6 +3823,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
|
||||
@@ -3906,7 +3957,7 @@ catch (sycl::exception const &exc)
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
||||
sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
|
||||
|
||||
if (ggml_backend_is_sycl(backend)) {
|
||||
@@ -4048,6 +4099,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
@@ -4193,6 +4245,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
#endif
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return true;
|
||||
case GGML_OP_L2_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
@@ -4301,6 +4354,7 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac
|
||||
|
||||
static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
|
||||
GGML_UNUSED(dev);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
||||
|
||||
sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
|
||||
|
||||
@@ -76,6 +76,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
|
||||
}
|
||||
|
||||
void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5);
|
||||
const float * k_d = static_cast<const float *>(dst->src[0]->data);
|
||||
const float * v_d = static_cast<const float *>(dst->src[1]->data);
|
||||
const float * r_d = static_cast<const float *>(dst->src[2]->data);
|
||||
|
||||
@@ -1059,8 +1059,10 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
case GGML_TYPE_Q4_K:
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
|
||||
reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
|
||||
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
|
||||
+94
-67
@@ -1,40 +1,50 @@
|
||||
#include "norm.hpp"
|
||||
#include "ggml-sycl/common.hpp"
|
||||
#include "ggml-sycl/presets.hpp"
|
||||
|
||||
static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
|
||||
const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
||||
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
||||
|
||||
const int nrows = item_ct1.get_group_range(2);
|
||||
const int nchannels = item_ct1.get_group_range(1);
|
||||
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int sample = item_ct1.get_group(0);
|
||||
const int channel = item_ct1.get_group(1);
|
||||
const int row = item_ct1.get_group(2);
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
|
||||
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
|
||||
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
|
||||
|
||||
x += strided_offset;
|
||||
dst += packed_offset;
|
||||
|
||||
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row * ncols + col];
|
||||
const float xi = x[col];
|
||||
mean_var.x() += xi;
|
||||
mean_var.y() += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = mean_var;
|
||||
if (block_size > WARP_SIZE) {
|
||||
const auto sub_group = item_ct1.get_sub_group();
|
||||
const auto sg_id = sub_group.get_group_linear_id();
|
||||
const auto wi_in_sg = sub_group.get_local_linear_id();
|
||||
if (wi_in_sg == 0) {
|
||||
s_sum[sg_id] = mean_var;
|
||||
}
|
||||
/*
|
||||
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
mean_var = 0.f;
|
||||
size_t nreduce = nwarps / WARP_SIZE;
|
||||
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
|
||||
for (size_t i = 0; i < nreduce; i += 1)
|
||||
{
|
||||
mean_var += s_sum[lane_id + i * WARP_SIZE];
|
||||
mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
|
||||
}
|
||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||
}
|
||||
@@ -44,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
|
||||
const float inv_std = sycl::rsqrt(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
|
||||
dst[col] = (x[col] - mean) * inv_std;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,39 +145,51 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
|
||||
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
||||
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||
|
||||
const int nrows = item_ct1.get_group_range(2);
|
||||
const int nchannels = item_ct1.get_group_range(1);
|
||||
|
||||
const int sample = item_ct1.get_group(0);
|
||||
const int channel = item_ct1.get_group(1);
|
||||
const int row = item_ct1.get_group(2);
|
||||
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
|
||||
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
|
||||
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
|
||||
|
||||
x += strided_offset;
|
||||
dst += packed_offset;
|
||||
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row * ncols + col];
|
||||
const float xi = x[col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
const auto sub_group = item_ct1.get_sub_group();
|
||||
const auto sg_id = sub_group.get_group_linear_id();
|
||||
const auto wi_in_sg = sub_group.get_local_linear_id();
|
||||
if (wi_in_sg == 0) {
|
||||
s_sum[sg_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
size_t nreduce = nwarps / WARP_SIZE;
|
||||
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
|
||||
tmp = 0.f;
|
||||
for (size_t i = 0; i < nreduce; i += 1)
|
||||
{
|
||||
tmp += s_sum[lane_id + i * WARP_SIZE];
|
||||
tmp += s_sum[wi_in_sg + i * WARP_SIZE];
|
||||
}
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
@@ -176,7 +198,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
|
||||
const float scale = sycl::rsqrt(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row * ncols + col] = scale * x[row * ncols + col];
|
||||
dst[col] = scale * x[col];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -224,20 +246,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float
|
||||
}
|
||||
}
|
||||
|
||||
static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
const int nrows, const float eps,
|
||||
queue_ptr stream, int device) {
|
||||
static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
||||
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
|
||||
const float eps, queue_ptr stream, int device) {
|
||||
|
||||
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -252,15 +274,12 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
*/
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
||||
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
||||
|
||||
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -313,21 +332,20 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
const int nrows, const float eps,
|
||||
queue_ptr stream, int device) {
|
||||
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
||||
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
||||
|
||||
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -344,12 +362,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||
cgh);
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -398,12 +414,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
}
|
||||
|
||||
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = dst->src[0]->ne[0];
|
||||
const int64_t nrows = ggml_nrows(dst->src[0]);
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
|
||||
@@ -411,8 +427,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
const size_t ts0 = ggml_type_size(src0->type);
|
||||
GGML_ASSERT(nb00 == ts0);
|
||||
const int64_t s01 = nb01 / ts0;
|
||||
const int64_t s02 = nb02 / ts0;
|
||||
const int64_t s03 = nb03 / ts0;
|
||||
|
||||
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
|
||||
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
|
||||
}
|
||||
|
||||
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
@@ -436,11 +458,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = dst->src[0]->ne[0];
|
||||
const int64_t nrows = ggml_nrows(dst->src[0]);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
|
||||
@@ -450,7 +471,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
const size_t ts0 = ggml_type_size(src0->type);
|
||||
GGML_ASSERT(nb00 == ts0);
|
||||
const int64_t s01 = nb01 / ts0;
|
||||
const int64_t s02 = nb02 / ts0;
|
||||
const int64_t s03 = nb03 / ts0;
|
||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
|
||||
}
|
||||
|
||||
void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "outprod.hpp"
|
||||
|
||||
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
|
||||
|
||||
@@ -355,8 +355,7 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
||||
}
|
||||
|
||||
void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
||||
ggml_sycl_op_rope(ctx, dst);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
|
||||
}
|
||||
|
||||
void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -249,16 +249,13 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
|
||||
const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
|
||||
GGML_SYCL_DEBUG("%s: F16 mask\n", __func__);
|
||||
soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
|
||||
main_stream, ctx.device);
|
||||
} else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
|
||||
const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
|
||||
GGML_SYCL_DEBUG("%s: F32 mask\n", __func__);
|
||||
soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
|
||||
} else {
|
||||
/* mask unavailable */
|
||||
GGML_SYCL_DEBUG("%s: No mask\n", __func__);
|
||||
soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,8 +56,8 @@ static void timestep_embedding_f32_sycl(
|
||||
}
|
||||
|
||||
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
@@ -69,5 +69,4 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
const int max_period = dst->op_params[1];
|
||||
|
||||
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
|
||||
GGML_UNUSED(src1);
|
||||
}
|
||||
|
||||
@@ -180,10 +180,7 @@ static void rwkv_wkv7_f32_kernel(
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6);
|
||||
const float* k_d = (const float*)dst->src[0]->data;
|
||||
const float* v_d = (const float*)dst->src[1]->data;
|
||||
const float* r_d = (const float*)dst->src[2]->data;
|
||||
@@ -236,16 +233,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/7);
|
||||
const float* r_d = (const float*)dst->src[0]->data;
|
||||
const float* w_d = (const float*)dst->src[1]->data;
|
||||
const float* k_d = (const float*)dst->src[2]->data;
|
||||
@@ -299,7 +290,4 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
}
|
||||
|
||||
@@ -2804,23 +2804,29 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
pipeline_robustness = true;
|
||||
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
||||
device->subgroup_size_control = true;
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT")) {
|
||||
device->coopmat_support = true;
|
||||
device->coopmat_m = 0;
|
||||
device->coopmat_n = 0;
|
||||
device->coopmat_k = 0;
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
|
||||
coopmat2_support = true;
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
|
||||
device->integer_dot_product = true;
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
|
||||
bfloat16_support = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4670,6 +4676,19 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
}
|
||||
|
||||
if (src->type == to) {
|
||||
// Copy two or four bytes at a time, depending on block size.
|
||||
// For quantized types, we scale by block size/type size. But
|
||||
// this path is also used for bf16->bf16 for example, where the
|
||||
// type size must be exactly 2 or 4.
|
||||
GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
|
||||
if ((ggml_type_size(src->type) % 4) == 0) {
|
||||
return ctx->device->pipeline_contig_cpy_f32_f32;
|
||||
} else {
|
||||
return ctx->device->pipeline_contig_cpy_f16_f16;
|
||||
}
|
||||
}
|
||||
|
||||
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -6433,6 +6452,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -6731,7 +6751,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
{
|
||||
const uint32_t ne = ggml_nelements(dst);
|
||||
uint32_t ne = ggml_nelements(dst);
|
||||
if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
||||
// Convert from number of logical elements to 2- or 4-byte units.
|
||||
ne /= ggml_blck_size(src0->type);
|
||||
if ((ggml_type_size(src0->type) % 4) == 0) {
|
||||
ne *= ggml_type_size(src0->type) / 4;
|
||||
} else {
|
||||
ne *= ggml_type_size(src0->type) / 2;
|
||||
}
|
||||
}
|
||||
if (ne > 262144) {
|
||||
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
||||
} else if (ne > 512) {
|
||||
@@ -7281,8 +7310,19 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
uint32_t ne = (uint32_t)ggml_nelements(src0);
|
||||
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
||||
// Convert from number of logical elements to 2- or 4-byte units.
|
||||
ne /= ggml_blck_size(src0->type);
|
||||
if ((ggml_type_size(src0->type) % 4) == 0) {
|
||||
ne *= ggml_type_size(src0->type) / 4;
|
||||
} else {
|
||||
ne *= ggml_type_size(src0->type) / 2;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
ne,
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
@@ -9264,8 +9304,7 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
|
||||
try {
|
||||
ptr = ggml_vk_host_malloc(vk_instance.devices[0], size);
|
||||
} catch (vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what());
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
@@ -9867,6 +9906,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// We can handle copying from a type to the same type if it's
|
||||
// contiguous (memcpy). We use f16 or f32 shaders to do the copy,
|
||||
// so the type/block size must be a multiple of 4.
|
||||
if (src0_type == src1_type &&
|
||||
ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) &&
|
||||
(ggml_type_size(src0_type) % 2) == 0) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_REPEAT:
|
||||
|
||||
@@ -2312,6 +2312,26 @@ struct ggml_tensor * ggml_repeat(
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_repeat_4d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
const bool can_repeat = ggml_is_empty(a) || (
|
||||
(ne0 % a->ne[0] == 0) &&
|
||||
(ne1 % a->ne[1] == 0) &&
|
||||
(ne2 % a->ne[2] == 0) &&
|
||||
(ne3 % a->ne[3] == 0)
|
||||
);
|
||||
GGML_ASSERT(can_repeat);
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
|
||||
|
||||
result->op = GGML_OP_REPEAT;
|
||||
result->src[0] = a;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_repeat_back
|
||||
|
||||
struct ggml_tensor * ggml_repeat_back(
|
||||
|
||||
@@ -546,6 +546,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
A_ENC_FFN_GATE = auto()
|
||||
A_ENC_FFN_DOWN = auto()
|
||||
A_MMPROJ = auto()
|
||||
A_MMPROJ_FC = auto()
|
||||
A_MM_NORM_PRE = auto()
|
||||
A_MM_NORM_MID = auto()
|
||||
|
||||
@@ -825,6 +826,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
|
||||
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
|
||||
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
|
||||
}
|
||||
@@ -885,6 +887,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE,
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN,
|
||||
MODEL_TENSOR.A_MMPROJ,
|
||||
MODEL_TENSOR.A_MMPROJ_FC,
|
||||
MODEL_TENSOR.A_MM_NORM_PRE,
|
||||
MODEL_TENSOR.A_MM_NORM_MID,
|
||||
],
|
||||
@@ -2256,6 +2259,8 @@ class VisionProjectorType:
|
||||
QWEN25VL = "qwen2.5vl_merger"
|
||||
ULTRAVOX = "ultravox"
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
QWEN25O = "qwen2.5o" # omni
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -1125,6 +1125,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.A_POST_NORM: (
|
||||
"audio_tower.layer_norm", # ultravox
|
||||
"audio_tower.ln_post", # qwen2omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q: (
|
||||
@@ -1161,10 +1162,18 @@ class TensorNameMap:
|
||||
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||
),
|
||||
|
||||
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||
# this prefix is added in the conversion code in modify_tensors()
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ: (
|
||||
"audio.multi_modal_projector.linear_{bid}", # ultravox
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ_FC: (
|
||||
"audio.multi_modal_projector.linear", # qwen2audio
|
||||
"audio_tower.proj", # qwen2omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: (
|
||||
"audio.multi_modal_projector.ln_pre", # ultravox
|
||||
),
|
||||
|
||||
+3
-2
@@ -471,6 +471,7 @@ extern "C" {
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API size_t llama_max_devices(void);
|
||||
LLAMA_API size_t llama_max_parallel_sequences(void);
|
||||
|
||||
LLAMA_API bool llama_supports_mmap (void);
|
||||
LLAMA_API bool llama_supports_mlock (void);
|
||||
@@ -611,11 +612,11 @@ extern "C" {
|
||||
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
||||
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
||||
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
|
||||
"Use llama_kv_self_seq_pos_max() instead");
|
||||
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
||||
|
||||
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
||||
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
|
||||
"Use llama_kv_self_seq_pos_max() instead");
|
||||
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
||||
|
||||
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
||||
LLAMA_API void llama_kv_self_clear(
|
||||
|
||||
Binary file not shown.
@@ -0,0 +1,112 @@
|
||||
ied 4 ½ months
|
||||
__ggml_vocab_test__
|
||||
Führer
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
Hello world
|
||||
__ggml_vocab_test__
|
||||
Hello world
|
||||
__ggml_vocab_test__
|
||||
Hello World
|
||||
__ggml_vocab_test__
|
||||
Hello World
|
||||
__ggml_vocab_test__
|
||||
Hello World!
|
||||
__ggml_vocab_test__
|
||||
Hello, world!
|
||||
__ggml_vocab_test__
|
||||
Hello, world!
|
||||
__ggml_vocab_test__
|
||||
this is 🦙.cpp
|
||||
__ggml_vocab_test__
|
||||
w048 7tuijk dsdfhu
|
||||
__ggml_vocab_test__
|
||||
нещо на Български
|
||||
__ggml_vocab_test__
|
||||
កាន់តែពិសេសអាចខលចេញ
|
||||
__ggml_vocab_test__
|
||||
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
(
|
||||
__ggml_vocab_test__
|
||||
|
||||
=
|
||||
__ggml_vocab_test__
|
||||
' era
|
||||
__ggml_vocab_test__
|
||||
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
|
||||
__ggml_vocab_test__
|
||||
!!!!!!
|
||||
__ggml_vocab_test__
|
||||
3
|
||||
__ggml_vocab_test__
|
||||
33
|
||||
__ggml_vocab_test__
|
||||
333
|
||||
__ggml_vocab_test__
|
||||
3333
|
||||
__ggml_vocab_test__
|
||||
33333
|
||||
__ggml_vocab_test__
|
||||
333333
|
||||
__ggml_vocab_test__
|
||||
3333333
|
||||
__ggml_vocab_test__
|
||||
33333333
|
||||
__ggml_vocab_test__
|
||||
333333333
|
||||
__ggml_vocab_test__
|
||||
Cửa Việt
|
||||
__ggml_vocab_test__
|
||||
discards
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
|
||||
__ggml_vocab_test__
|
||||
@@ -0,0 +1,46 @@
|
||||
17 297 201 78660 21775
|
||||
72805 4097 56
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
35378 8999
|
||||
35378 8999
|
||||
35378 6661
|
||||
35378 6661
|
||||
35378 6661 38
|
||||
35378 4 8999 38
|
||||
35378 4 8999 38
|
||||
903 83 6 3 5 238 6366
|
||||
148 7709 1019 361 458 134362 104 7 71 420 1132
|
||||
14271 29 117152
|
||||
6 149561 78270 48967 64254 7616 81705
|
||||
6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 15 191 538 28 121505 450 1556 6863 10002 47 1098 16
|
||||
35378
|
||||
35378
|
||||
35378
|
||||
35378
|
||||
35378
|
||||
35378 35378
|
||||
15
|
||||
2203
|
||||
242 1615
|
||||
35378 4 113 25 5584 38 11249 621 398 6 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087
|
||||
6 90827
|
||||
138
|
||||
3912
|
||||
6 66000
|
||||
138 66000
|
||||
3912 66000
|
||||
6 66000 66000
|
||||
138 66000 66000
|
||||
3912 66000 66000
|
||||
6 66000 66000 66000
|
||||
199152 3763
|
||||
17116 99397
|
||||
6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 6 3 138 3912 6 66000 138 66000 3912 66000 6 66000 66000 138 66000 66000 3912 66000 66000 80308 1031 5 363 138 27 363 6 149561 78270 48967 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087 6 110405 1369 69112 69112 69112 14271 29 117152 5106 4765 4765 1135 164721 164721 164721 58 58 58 58 2551 90827 32 85908 87 25 272 2809 242 18 18345 764 25 7 2685 4 242 11766 398 9077 32 242 594 959 9077 87 25 1181 3249 442 4 242 397 398 1884 3060 26156 32 1401 25 26455 10 25 141 866
|
||||
@@ -0,0 +1,62 @@
|
||||
{%- if tools %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{%- if messages[0]['role'] == 'system' %}
|
||||
{{- messages[0]['content'] }}
|
||||
{%- else %}
|
||||
{{- '' }}
|
||||
{%- endif %}
|
||||
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||
{%- else %}
|
||||
{%- if messages[0]['role'] == 'system' %}
|
||||
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" and not message.tool_calls %}
|
||||
{%- set content = message.content %}
|
||||
{%- if not loop.last %}
|
||||
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set content = message.content %}
|
||||
{%- if not loop.last %}
|
||||
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{{- '<|im_start|>' + message.role }}
|
||||
{%- if message.content %}
|
||||
{{- '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_call>\n{"name": "' }}
|
||||
{{- tool_call.name }}
|
||||
{{- '", "arguments": ' }}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{{- '}\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- message.content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n<think>\n' }}
|
||||
{%- endif %}
|
||||
@@ -0,0 +1,85 @@
|
||||
{%- if tools %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- messages[0].content + '\n\n' }}
|
||||
{%- endif %}
|
||||
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- for message in messages %}
|
||||
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set content = message.content %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in message.content %}
|
||||
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{%- if loop.last or (not loop.last and reasoning_content) %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if (loop.first and content) or (not loop.first) %}
|
||||
{{- '\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.function %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '<tool_call>\n{"name": "' }}
|
||||
{{- tool_call.name }}
|
||||
{{- '", "arguments": ' }}
|
||||
{%- if tool_call.arguments is string %}
|
||||
{{- tool_call.arguments }}
|
||||
{%- else %}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{%- endif %}
|
||||
{{- '}\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- message.content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
@@ -19,4 +19,6 @@ These templates can be updated with the following commands:
|
||||
./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
|
||||
./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
|
||||
./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
|
||||
./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
|
||||
./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
|
||||
```
|
||||
@@ -17,14 +17,14 @@ rm -f llama-bench.sqlite > /dev/null
|
||||
|
||||
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
|
||||
if [ -n "$GGML_CUDA" ]; then
|
||||
cmake_opts="-DGGML_CUDA=ON"
|
||||
CMAKE_OPTS="${CMAKE_OPTS} -DGGML_CUDA=ON"
|
||||
fi
|
||||
|
||||
dir="build-bench"
|
||||
|
||||
function run {
|
||||
rm -fr ${dir} > /dev/null
|
||||
cmake -B ${dir} -S . $cmake_opts > /dev/null
|
||||
cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
|
||||
cmake --build ${dir} -t llama-bench > /dev/null
|
||||
${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
}
|
||||
|
||||
@@ -1 +1 @@
|
||||
7c06c10c532a6cda913c17fc56341e8880ae341d
|
||||
06b715f4c170232af261425240914fa49c44f982
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server
|
||||
export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp}
|
||||
|
||||
./scripts/tool_bench.py run --n 10 --temp -1 --temp 0 --temp 1 --temp 2 --temp 5 --llama-baseline $PWD/buildMaster/bin/llama-server --output qwen14b.jsonl --hf bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_L
|
||||
./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 1.5B Q4_K_M" --output qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF --ollama qwen2.5:1.5b-instruct-q4_K_M
|
||||
./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 Coder 7B Q4_K_M" --output qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF --ollama qwen2.5-coder:7b
|
||||
|
||||
@@ -205,6 +206,7 @@ def run(
|
||||
model: Annotated[Optional[str], typer.Option(help="Name of the model to test (server agnostic)")] = None,
|
||||
hf: Annotated[Optional[str], typer.Option(help="GGUF huggingface model repo id (+ optional quant) to test w/ llama-server")] = None,
|
||||
chat_template: Annotated[Optional[str], typer.Option(help="Chat template override for llama-server")] = None,
|
||||
chat_template_file: Annotated[Optional[str], typer.Option(help="Chat template file override for llama-server")] = None,
|
||||
ollama: Annotated[Optional[str], typer.Option(help="Ollama model tag to test")] = None,
|
||||
llama_baseline: Annotated[Optional[str], typer.Option(help="llama-server baseline binary path to use as baseline")] = None,
|
||||
n: Annotated[int, typer.Option(help="Number of times to run each test")] = 10,
|
||||
@@ -229,6 +231,12 @@ def run(
|
||||
# n_ctx = 8192
|
||||
n_ctx = 2048
|
||||
|
||||
if model is None:
|
||||
if hf is not None:
|
||||
model = hf.split("/")[-1]
|
||||
elif ollama is not None:
|
||||
model = ollama
|
||||
|
||||
assert force or append or not output.exists(), f"Output file already exists: {output}; use --force to overwrite"
|
||||
|
||||
with output.open('a' if append else 'w') as output_file:
|
||||
@@ -320,6 +328,7 @@ def run(
|
||||
server.model_hf_repo = hf
|
||||
server.model_hf_file = None
|
||||
server.chat_template = chat_template
|
||||
server.chat_template_file = chat_template_file
|
||||
server.server_path = server_path
|
||||
if port is not None:
|
||||
server.server_port = port
|
||||
@@ -335,6 +344,7 @@ def run(
|
||||
temp=t,
|
||||
output_kwargs=dict(
|
||||
chat_template=chat_template,
|
||||
chat_template_file=chat_template_file,
|
||||
),
|
||||
request_kwargs=dict(
|
||||
ignore_chat_grammar=ignore_chat_grammar,
|
||||
@@ -355,6 +365,7 @@ def run(
|
||||
temp=t,
|
||||
output_kwargs=dict(
|
||||
chat_template=None,
|
||||
chat_template_file=None,
|
||||
),
|
||||
request_kwargs=dict(
|
||||
model=ollama,
|
||||
|
||||
@@ -14,6 +14,7 @@ add_library(llama
|
||||
llama-batch.cpp
|
||||
llama-chat.cpp
|
||||
llama-context.cpp
|
||||
llama-cparams.cpp
|
||||
llama-grammar.cpp
|
||||
llama-graph.cpp
|
||||
llama-hparams.cpp
|
||||
|
||||
+19
-3
@@ -25,7 +25,11 @@ llama_context::llama_context(
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
||||
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
||||
if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
|
||||
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
|
||||
}
|
||||
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch;
|
||||
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
||||
@@ -689,12 +693,18 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
// TODO: move the validation to the llama_batch_allocr
|
||||
if (batch.token) {
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
throw -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -848,7 +858,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
int llama_context::decode(llama_batch & inp_batch) {
|
||||
if (!memory) {
|
||||
LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
|
||||
return encode(inp_batch);
|
||||
}
|
||||
|
||||
@@ -883,11 +893,17 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
// TODO: move the validation to the llama_batch_allocr
|
||||
if (batch.token) {
|
||||
for (int64_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
|
||||
throw std::runtime_error("invalid token");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1 +1,5 @@
|
||||
#include "llama-cparams.h"
|
||||
|
||||
size_t llama_max_parallel_sequences(void) {
|
||||
return LLAMA_MAX_PARALLEL_SEQUENCES;
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define LLAMA_MAX_PARALLEL_SEQUENCES 64
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
uint32_t n_batch;
|
||||
|
||||
+12
-2
@@ -1177,8 +1177,18 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
for (const auto & trigger_pattern : grammar.trigger_patterns) {
|
||||
if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
|
||||
grammar.awaiting_trigger = false;
|
||||
// get from the first match to the end of the string
|
||||
auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
|
||||
// get from the first matched capturing group to the end of the string
|
||||
size_t start = std::string::npos;
|
||||
for (auto i = 1u; i < match.size(); i++) {
|
||||
if (match.length(i) > 0) {
|
||||
start = match.position(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (start == std::string::npos) {
|
||||
start = match.position(0);
|
||||
}
|
||||
auto constrained_str = grammar.trigger_buffer.substr(start);
|
||||
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, constrained_str);
|
||||
|
||||
+4
-4
@@ -1287,6 +1287,10 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4) {
|
||||
// GLM4 seems to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
@@ -1367,10 +1371,6 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4) {
|
||||
// GLM4 seems to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
||||
+17
-1
@@ -2,6 +2,22 @@
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
}
|
||||
}
|
||||
|
||||
bool llama_hparams::is_swa_any() const {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
if (swa_layers[il]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_head(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return n_head_arr[il];
|
||||
@@ -72,7 +88,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
|
||||
|
||||
bool llama_hparams::is_swa(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1));
|
||||
return swa_layers[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
+23
-14
@@ -102,20 +102,12 @@ struct llama_hparams {
|
||||
|
||||
// Sliding Window Attention (SWA)
|
||||
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA)
|
||||
uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. non-SWA)
|
||||
// by default n == 1, all layers are dense
|
||||
// note that if n_swa_pattern == 0, all layers are SWA
|
||||
// example: n_swa_pattern = 3
|
||||
// il == 0: swa
|
||||
// il == 1: swa
|
||||
// il == 2: dense
|
||||
// il == 3: swa
|
||||
// il == 4: swa
|
||||
// il == 5: dense
|
||||
// il == 6: swa
|
||||
// etc ...
|
||||
// the size of the sliding window (0 - no SWA)
|
||||
uint32_t n_swa = 0;
|
||||
// if swa_layers[il] == true, then layer il is SWA
|
||||
// if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
|
||||
// by default, all layers are dense
|
||||
std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
|
||||
|
||||
// for State Space Models
|
||||
uint32_t ssm_d_conv = 0;
|
||||
@@ -153,6 +145,23 @@ struct llama_hparams {
|
||||
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
||||
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
||||
|
||||
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
|
||||
// note that if n_pattern == 0, all layers are SWA
|
||||
// if n_pattern == 1, all layers are dense
|
||||
// example: n_pattern = 3
|
||||
// il == 0: swa
|
||||
// il == 1: swa
|
||||
// il == 2: dense
|
||||
// il == 3: swa
|
||||
// il == 4: swa
|
||||
// il == 5: dense
|
||||
// il == 6: swa
|
||||
// etc ...
|
||||
void set_swa_pattern(uint32_t n_pattern);
|
||||
|
||||
// return true if one of the layers is SWA
|
||||
bool is_swa_any() const;
|
||||
|
||||
uint32_t n_head(uint32_t il = 0) const;
|
||||
|
||||
uint32_t n_head_kv(uint32_t il = 0) const;
|
||||
|
||||
+157
-245
@@ -65,8 +65,6 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
};
|
||||
|
||||
head = 0;
|
||||
size = kv_size;
|
||||
used = 0;
|
||||
|
||||
cells.resize(kv_size);
|
||||
|
||||
@@ -138,13 +136,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::clear() {
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
cells[i].pos = -1;
|
||||
cells[i].seq_id.clear();
|
||||
}
|
||||
cells.reset();
|
||||
|
||||
head = 0;
|
||||
used = 0;
|
||||
|
||||
for (auto & buf : bufs) {
|
||||
ggml_backend_buffer_clear(buf.get(), 0);
|
||||
@@ -152,7 +146,7 @@ void llama_kv_cache_unified::clear() {
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
uint32_t new_head = size;
|
||||
uint32_t new_head = cells.size();
|
||||
|
||||
if (p0 < 0) {
|
||||
p0 = 0;
|
||||
@@ -162,33 +156,20 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].pos >= p0 && cells[i].pos < p1) {
|
||||
if (seq_id < 0) {
|
||||
cells[i].seq_id.clear();
|
||||
} else if (cells[i].has_seq_id(seq_id)) {
|
||||
cells[i].seq_id.erase(seq_id);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cells[i].is_empty()) {
|
||||
// keep count of the number of used cells
|
||||
if (cells[i].pos >= 0) {
|
||||
used--;
|
||||
}
|
||||
|
||||
cells[i].pos = -1;
|
||||
|
||||
if (new_head == size) {
|
||||
new_head = i;
|
||||
}
|
||||
if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != size && new_head < head) {
|
||||
if (new_head != cells.size() && new_head < head) {
|
||||
head = new_head;
|
||||
}
|
||||
|
||||
@@ -208,49 +189,40 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
|
||||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
// otherwise, this is the KV of a Transformer-like model
|
||||
head = 0;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
|
||||
cells[i].seq_id.insert(seq_id_dst);
|
||||
if (cells.seq_has(i, seq_id_src)) {
|
||||
cells.seq_add(i, seq_id_dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
|
||||
uint32_t new_head = size;
|
||||
uint32_t new_head = cells.size();
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (!cells[i].has_seq_id(seq_id)) {
|
||||
if (cells[i].pos >= 0) {
|
||||
used--;
|
||||
}
|
||||
|
||||
cells[i].pos = -1;
|
||||
cells[i].seq_id.clear();
|
||||
|
||||
if (new_head == size){
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (cells.seq_keep(i, seq_id)) {
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
}
|
||||
} else {
|
||||
cells[i].seq_id.clear();
|
||||
cells[i].seq_id.insert(seq_id);
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != size && new_head < head) {
|
||||
if (new_head != cells.size() && new_head < head) {
|
||||
head = new_head;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
||||
if (delta == 0) {
|
||||
void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
||||
if (shift == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t new_head = size;
|
||||
uint32_t new_head = cells.size();
|
||||
|
||||
if (p0 < 0) {
|
||||
p0 = 0;
|
||||
@@ -260,25 +232,19 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
|
||||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
// If there is no range then return early to avoid looping over the
|
||||
// If there is no range then return early to avoid looping over all cells.
|
||||
if (p0 == p1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
|
||||
has_shift = true;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cells[i].pos += delta;
|
||||
cells[i].delta += delta;
|
||||
|
||||
if (cells[i].pos < 0) {
|
||||
if (!cells[i].is_empty()) {
|
||||
used--;
|
||||
}
|
||||
cells[i].pos = -1;
|
||||
cells[i].seq_id.clear();
|
||||
if (new_head == size) {
|
||||
if (cells.seq_has(i, seq_id)) {
|
||||
if (cells.pos_add(i, shift)) {
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
}
|
||||
}
|
||||
@@ -287,7 +253,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
// Otherwise we just start the next search from the beginning.
|
||||
head = new_head != size ? new_head : 0;
|
||||
head = new_head != cells.size() ? new_head : 0;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
||||
@@ -308,67 +274,35 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
|
||||
has_shift = true;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
{
|
||||
llama_pos p_old = cells[i].pos;
|
||||
cells[i].pos /= d;
|
||||
cells[i].delta += cells[i].pos - p_old;
|
||||
}
|
||||
if (cells.seq_has(i, seq_id)) {
|
||||
cells.pos_div(i, d);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
|
||||
llama_pos result = std::numeric_limits<llama_pos>::max();
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].has_seq_id(seq_id)) {
|
||||
result = std::min(result, cells[i].pos);
|
||||
}
|
||||
}
|
||||
|
||||
if (result == std::numeric_limits<llama_pos>::max()) {
|
||||
result = -1;
|
||||
}
|
||||
|
||||
return result;
|
||||
return cells.seq_pos_min(seq_id);
|
||||
}
|
||||
|
||||
llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
|
||||
llama_pos result = -1;
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].has_seq_id(seq_id)) {
|
||||
result = std::max(result, cells[i].pos);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
return cells.seq_pos_max(seq_id);
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::restore() {
|
||||
for (const auto & [id, cell] : recovery.cells) {
|
||||
// TODO: move to new `struct kv_cells`
|
||||
const bool is_empty0 = cells[id].is_empty();
|
||||
const bool is_empty1 = cell.is_empty();
|
||||
|
||||
if (!is_empty0 && is_empty1) {
|
||||
used--;
|
||||
} else if (is_empty0 && !is_empty1) {
|
||||
used++;
|
||||
}
|
||||
|
||||
cells[id] = cell;
|
||||
for (auto & state : recovery.states) {
|
||||
cells.set(state.i, state.cells);
|
||||
}
|
||||
|
||||
recovery.clear();
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::commit() {
|
||||
if (recovery.cells.empty()) {
|
||||
if (recovery.states.empty()) {
|
||||
LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
|
||||
__func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
|
||||
return;
|
||||
@@ -382,7 +316,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
|
||||
auto * sched = lctx.get_sched();
|
||||
|
||||
if (has_shift) {
|
||||
if (cells.get_has_shift()) {
|
||||
if (!get_can_shift()) {
|
||||
GGML_ABORT("The current KV cache / model configuration does not support K-shift");
|
||||
}
|
||||
@@ -406,13 +340,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
need_reserve = true;
|
||||
}
|
||||
|
||||
{
|
||||
has_shift = false;
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
cells[i].delta = 0;
|
||||
}
|
||||
}
|
||||
cells.reset_shift();
|
||||
}
|
||||
|
||||
if (do_defrag) {
|
||||
@@ -443,7 +371,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
void llama_kv_cache_unified::defrag_sched(float thold) {
|
||||
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||
// - count the padding towards the number of used tokens
|
||||
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;
|
||||
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n)) : 0.0f;
|
||||
|
||||
// queue defragmentation for next llama_kv_cache_update
|
||||
if (fragmentation > thold) {
|
||||
@@ -454,7 +382,7 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::set_full() {
|
||||
n = size;
|
||||
n = cells.size();
|
||||
|
||||
// when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
|
||||
// affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
|
||||
@@ -478,14 +406,14 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (head > used + 2*ubatch.n_tokens) {
|
||||
if (head > cells.get_used() + 2*ubatch.n_tokens) {
|
||||
head = 0;
|
||||
}
|
||||
|
||||
// otherwise, one cell per token.
|
||||
|
||||
if (n_tokens > size) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size);
|
||||
if (n_tokens > cells.size()) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -498,10 +426,10 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
||||
std::string ss;
|
||||
if (n_swa > 0) {
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (cells[i].pos == -1) {
|
||||
if (cells.is_empty(i)) {
|
||||
ss += '.';
|
||||
} else {
|
||||
ss += std::to_string(*cells[i].seq_id.begin());
|
||||
ss += 'x';
|
||||
}
|
||||
if (i%256 == 255) {
|
||||
ss += '\n';
|
||||
@@ -515,15 +443,16 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
while (true) {
|
||||
if (head + n_tokens > size) {
|
||||
n_tested += size - head;
|
||||
if (head + n_tokens > cells.size()) {
|
||||
n_tested += cells.size() - head;
|
||||
head = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
if (cells[head + i].pos >= 0) {
|
||||
// TODO: improve to accept cells that are masked by the SWA
|
||||
if (!cells.is_empty(head + i)) {
|
||||
found = false;
|
||||
head += i + 1;
|
||||
n_tested += i + 1;
|
||||
@@ -535,31 +464,27 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_tested >= size) {
|
||||
if (n_tested >= cells.size()) {
|
||||
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
// remember the original state
|
||||
if (recovery.cells.find(head + i) == recovery.cells.end()) {
|
||||
recovery.cells[head + i] = cells[head + i];
|
||||
}
|
||||
// store the old state of the cells in the recovery stack
|
||||
recovery.states.push_back({head, cells.cp(head, n_tokens)});
|
||||
|
||||
cells[head + i].pos = ubatch.pos[i];
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
cells.pos_set(head + i, ubatch.pos[i]);
|
||||
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
|
||||
cells[head + i].seq_id.insert(ubatch.seq_id[i][j]);
|
||||
cells.seq_add(head + i, ubatch.seq_id[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
used += n_tokens;
|
||||
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
// if we start defragmenting the cache, the benefit from this will be more important
|
||||
n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));
|
||||
n = std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
|
||||
|
||||
#ifdef FIND_SLOT_DEBUG
|
||||
LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
|
||||
@@ -577,7 +502,7 @@ uint32_t llama_kv_cache_unified::get_n() const {
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache_unified::get_size() const {
|
||||
return size;
|
||||
return cells.size();
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const {
|
||||
@@ -661,30 +586,19 @@ void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llam
|
||||
|
||||
int n_attended = 0;
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
const llama_pos p0 = cells[i].pos;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.seq_has(i, seq_id)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_pos p0 = cells.pos_get(i);
|
||||
|
||||
if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
|
||||
n_attended++;
|
||||
}
|
||||
|
||||
if (is_masked_swa(p0, pmax)) {
|
||||
if (seq_id < 0) {
|
||||
cells[i].seq_id.clear();
|
||||
} else if (cells[i].has_seq_id(seq_id)) {
|
||||
cells[i].seq_id.erase(seq_id);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cells[i].is_empty()) {
|
||||
// keep count of the number of used cells
|
||||
if (cells[i].pos >= 0) {
|
||||
used--;
|
||||
}
|
||||
|
||||
cells[i].pos = -1;
|
||||
}
|
||||
cells.seq_rm(i, seq_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -723,25 +637,31 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||
const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const llama_pos p0 = cells[i].pos;
|
||||
float f = 0.0f;
|
||||
|
||||
bool masked = false;
|
||||
|
||||
// mask the token if not the same sequence
|
||||
masked = masked || (!cells[i].has_seq_id(seq_id));
|
||||
if (cells.is_empty(i)) {
|
||||
masked = true;
|
||||
} else {
|
||||
const llama_pos p0 = cells.pos_get(i);
|
||||
|
||||
// mask future tokens
|
||||
masked = masked || (causal_attn && p0 > p1);
|
||||
// mask the token if not the same sequence
|
||||
masked = masked || (!cells.seq_has(i, seq_id));
|
||||
|
||||
// apply SWA if any
|
||||
masked = masked || (is_masked_swa(p0, p1));
|
||||
// mask future tokens
|
||||
masked = masked || (causal_attn && p0 > p1);
|
||||
|
||||
float f = 0.0f;
|
||||
// apply SWA if any
|
||||
masked = masked || (is_masked_swa(p0, p1));
|
||||
|
||||
if (!masked && hparams.use_alibi) {
|
||||
f = -std::abs(p0 - p1);
|
||||
}
|
||||
}
|
||||
|
||||
if (masked) {
|
||||
f = -INFINITY;
|
||||
} else if (hparams.use_alibi) {
|
||||
f = -std::abs(p0 - p1);
|
||||
}
|
||||
|
||||
data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
|
||||
@@ -765,8 +685,8 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
|
||||
|
||||
int32_t * data = (int32_t *) dst->data;
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
data[i] = cells[i].delta;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -783,7 +703,10 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
|
||||
// the position when the cells is empty is irrelevant - it will be masked out later in the attention
|
||||
const llama_pos p0 = cells.is_empty(i) ? -1 : cells.pos_get(i);
|
||||
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(p0, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -910,7 +833,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
|
||||
ggml_tensor * k =
|
||||
ggml_view_3d(ctx, layer.k,
|
||||
n_embd_head_k, n_head_kv, size,
|
||||
n_embd_head_k, n_head_kv, cells.size(),
|
||||
ggml_row_size(layer.k->type, n_embd_head_k),
|
||||
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
||||
0);
|
||||
@@ -1050,12 +973,12 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx, layer.v,
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(layer.v->type, size),
|
||||
ggml_row_size(layer.v->type, cells.size()),
|
||||
ggml_row_size(layer.v->type, i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx, layer.v,
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(layer.v->type, size),
|
||||
ggml_row_size(layer.v->type, cells.size()),
|
||||
ggml_row_size(layer.v->type, id));
|
||||
}
|
||||
|
||||
@@ -1075,8 +998,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
const uint32_t n_layer = layers.size();
|
||||
|
||||
const uint32_t n_kv = cell_max();
|
||||
const uint32_t n_used = used;
|
||||
const uint32_t n_kv = cells.used_max_p1();
|
||||
const uint32_t n_used = cells.get_used();
|
||||
|
||||
assert(n_used <= n_kv);
|
||||
|
||||
@@ -1104,9 +1027,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
ids.resize(n_kv, n_kv);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
||||
const auto & cell0 = cells[i0];
|
||||
|
||||
if (!cell0.is_empty()) {
|
||||
if (!cells.is_empty(i0)) {
|
||||
ids[i0] = i0;
|
||||
|
||||
continue;
|
||||
@@ -1117,7 +1038,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
uint32_t nh = 1;
|
||||
|
||||
// determine the size of the hole
|
||||
while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
|
||||
while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
|
||||
nh++;
|
||||
}
|
||||
|
||||
@@ -1126,9 +1047,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
|
||||
// starting from the end, find nh non-empty cells
|
||||
for (; is > i0; --is) {
|
||||
const auto & cell1 = cells[is];
|
||||
|
||||
if (cell1.is_empty() || ids[is] != n_kv) {
|
||||
if (cells.is_empty(is) || ids[is] != n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1155,9 +1074,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
|
||||
// go back and move the nf cells to the hole
|
||||
for (; i1 < n_kv; ++i1) {
|
||||
auto & cell1 = cells[i1];
|
||||
|
||||
if (cell1.is_empty() || ids[i1] != n_kv) {
|
||||
if (cells.is_empty(i1) || ids[i1] != n_kv) {
|
||||
if (n_moves == max_moves) {
|
||||
stop = true;
|
||||
break;
|
||||
@@ -1171,10 +1088,8 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
ids[i1] = i0 + nf;
|
||||
|
||||
// move the cell meta data
|
||||
cells[i0 + nf] = cell1;
|
||||
cells.mv(i1, i0 + nf);
|
||||
|
||||
// clear the old cell and move the head there
|
||||
cell1 = kv_cell();
|
||||
head = n_used;
|
||||
|
||||
if (!cont) {
|
||||
@@ -1209,22 +1124,8 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache_unified::cell_max() const {
|
||||
for (uint32_t i = size; i > 0; --i) {
|
||||
const kv_cell & cell = cells[i - 1];
|
||||
|
||||
if (cell.pos >= 0 && !cell.is_empty()) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
||||
if (p0 < 0) {
|
||||
return true;
|
||||
}
|
||||
assert(p0 >= 0 && p1 >= 0);
|
||||
|
||||
switch (swa_type) {
|
||||
case LLAMA_SWA_TYPE_NONE:
|
||||
@@ -1255,23 +1156,24 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
|
||||
|
||||
// Count the number of cells with the specified seq_id
|
||||
// Find all the ranges of cells with this seq id (or all, when -1)
|
||||
uint32_t cell_range_begin = size;
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
const auto & cell = cells[i];
|
||||
if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
|
||||
uint32_t cell_range_begin = cells.size();
|
||||
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
|
||||
++cell_count;
|
||||
if (cell_range_begin == size) {
|
||||
if (cell_range_begin == cells.size()) {
|
||||
cell_range_begin = i;
|
||||
}
|
||||
} else {
|
||||
if (cell_range_begin != size) {
|
||||
if (cell_range_begin != cells.size()) {
|
||||
cell_ranges.emplace_back(cell_range_begin, i);
|
||||
cell_range_begin = size;
|
||||
cell_range_begin = cells.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cell_range_begin != size) {
|
||||
cell_ranges.emplace_back(cell_range_begin, size);
|
||||
|
||||
if (cell_range_begin != cells.size()) {
|
||||
cell_ranges.emplace_back(cell_range_begin, cells.size());
|
||||
}
|
||||
|
||||
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
||||
@@ -1308,17 +1210,24 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
|
||||
void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
|
||||
for (const auto & range : cell_ranges) {
|
||||
for (uint32_t i = range.first; i < range.second; ++i) {
|
||||
const auto & cell = cells[i];
|
||||
const llama_pos pos = cell.pos;
|
||||
const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
|
||||
std::vector<llama_seq_id> seq_ids;
|
||||
|
||||
for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) {
|
||||
if (cur == seq_id || seq_id == -1) {
|
||||
if (cells.seq_has(i, cur)) {
|
||||
seq_ids.push_back(cur);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const llama_pos pos = cells.pos_get(i);
|
||||
const uint32_t n_seq_id = seq_ids.size();
|
||||
|
||||
io.write(&pos, sizeof(pos));
|
||||
io.write(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
if (n_seq_id) {
|
||||
for (auto seq_id : cell.seq_id) {
|
||||
io.write(&seq_id, sizeof(seq_id));
|
||||
}
|
||||
for (const auto & seq_id : seq_ids) {
|
||||
io.write(&seq_id, sizeof(seq_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1379,7 +1288,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
|
||||
}
|
||||
} else {
|
||||
// When v is transposed, we also need the element size and get the element ranges from each row
|
||||
const uint32_t kv_size = size;
|
||||
const uint32_t kv_size = cells.size();
|
||||
|
||||
for (const auto & layer : layers) {
|
||||
const uint32_t il = layer.il;
|
||||
@@ -1429,14 +1338,20 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
io.read_to(&pos, sizeof(pos));
|
||||
io.read_to(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
if (n_seq_id != 0) {
|
||||
if (n_seq_id != 1) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
batch.pos[i] = pos;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id[i] = &dest_seq_id;
|
||||
// read the sequence id, but directly discard it - we will use dest_seq_id instead
|
||||
{
|
||||
llama_seq_id seq_id;
|
||||
io.read_to(&seq_id, sizeof(seq_id));
|
||||
}
|
||||
|
||||
batch.pos[i] = pos;
|
||||
batch.n_seq_id[i] = n_seq_id;
|
||||
batch.seq_id[i] = &dest_seq_id;
|
||||
}
|
||||
|
||||
if (!find_slot(batch)) {
|
||||
@@ -1448,15 +1363,15 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
// DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
GGML_ASSERT(head + cell_count <= size);
|
||||
GGML_ASSERT(cells[head].pos == batch.pos[0]);
|
||||
GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
|
||||
GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
|
||||
GGML_ASSERT(head + cell_count <= cells.size());
|
||||
GGML_ASSERT(cells.pos_get(head) == batch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head + cell_count - 1) == batch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.seq_has(head, dest_seq_id));
|
||||
GGML_ASSERT(cells.seq_has(head + cell_count - 1, dest_seq_id));
|
||||
} else {
|
||||
// whole KV cache restore
|
||||
|
||||
if (cell_count > size) {
|
||||
if (cell_count > cells.size()) {
|
||||
LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1464,15 +1379,13 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
clear();
|
||||
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
kv_cell & cell = cells[i];
|
||||
|
||||
llama_pos pos;
|
||||
uint32_t n_seq_id;
|
||||
|
||||
io.read_to(&pos, sizeof(pos));
|
||||
io.read_to(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
cell.pos = pos;
|
||||
cells.pos_set(i, pos);
|
||||
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
llama_seq_id seq_id;
|
||||
@@ -1483,12 +1396,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
return false;
|
||||
}
|
||||
|
||||
cell.seq_id.insert(seq_id);
|
||||
cells.seq_add(i, seq_id);
|
||||
}
|
||||
}
|
||||
|
||||
head = 0;
|
||||
used = cell_count;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1505,8 +1417,8 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
|
||||
return false;
|
||||
}
|
||||
if (cell_count > size) {
|
||||
LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
|
||||
if (cell_count > cells.size()) {
|
||||
LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size());
|
||||
return false;
|
||||
}
|
||||
if (this->v_trans != (bool) v_trans) {
|
||||
@@ -1609,7 +1521,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
if (cell_count) {
|
||||
// For each row in the transposed matrix, read the values for the whole cell range
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const size_t dst_offset = (head + j * size) * v_size_el;
|
||||
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
||||
}
|
||||
}
|
||||
@@ -1689,9 +1601,9 @@ void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
|
||||
kv_swa ->seq_keep(seq_id);
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
||||
kv_base->seq_add(seq_id, p0, p1, delta);
|
||||
kv_swa ->seq_add(seq_id, p0, p1, delta);
|
||||
void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
||||
kv_base->seq_add(seq_id, p0, p1, shift);
|
||||
kv_swa ->seq_add(seq_id, p0, p1, shift);
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
||||
@@ -2063,8 +1975,8 @@ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
||||
if (delta == 0) {
|
||||
void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
||||
if (shift == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2087,7 +1999,7 @@ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_
|
||||
if (tail_id >= 0) {
|
||||
kv_cell & cell = cells[tail_id];
|
||||
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
|
||||
cell.pos += delta;
|
||||
cell.pos += shift;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+21
-34
@@ -4,6 +4,7 @@
|
||||
#include "llama-io.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-memory.h"
|
||||
#include "llama-kv-cells.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
@@ -35,6 +36,7 @@ struct llama_kv_cache : public llama_memory_i {
|
||||
virtual void defrag_sched(float thold) = 0;
|
||||
|
||||
// simulate full cache, used for allocating worst-case compute buffers
|
||||
// TODO: remove
|
||||
virtual void set_full() = 0;
|
||||
|
||||
//
|
||||
@@ -42,7 +44,7 @@ struct llama_kv_cache : public llama_memory_i {
|
||||
//
|
||||
|
||||
// =============================================================================================================
|
||||
// TODO: refactor and simplify this
|
||||
// TODO: refactor and simplify this [TAG: KV_API]
|
||||
|
||||
virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
|
||||
|
||||
@@ -121,7 +123,7 @@ public:
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
|
||||
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
||||
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
@@ -159,7 +161,7 @@ public:
|
||||
// llama_kv_cache_unified specific API
|
||||
//
|
||||
|
||||
uint32_t get_n() const;
|
||||
uint32_t get_n() const;
|
||||
uint32_t get_size() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
@@ -180,26 +182,6 @@ private:
|
||||
const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
|
||||
// TODO: replace with bitset uint64_t
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
struct kv_layer {
|
||||
// layer index in the model
|
||||
// note: can be different from the layer index in the KV cache
|
||||
@@ -209,15 +191,13 @@ private:
|
||||
ggml_tensor * v;
|
||||
};
|
||||
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
|
||||
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
|
||||
uint32_t size = 0; // total number of cells, shared across all sequences
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id) (TODO: add `struct kv_cells` and keep track automaticallt)
|
||||
|
||||
// computed before each graph build
|
||||
// TODO: cells should start to maintain this value dynamically based on the edits
|
||||
uint32_t n = 0;
|
||||
|
||||
const uint32_t n_seq_max = 1;
|
||||
@@ -233,19 +213,29 @@ private:
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
std::vector<kv_cell> cells; // TODO: replace with `struct kv_cells`
|
||||
llama_kv_cells_unified cells;
|
||||
|
||||
std::vector<kv_layer> layers;
|
||||
|
||||
// model layer id -> KV cache layer id
|
||||
std::unordered_map<int32_t, int32_t> map_layer_ids;
|
||||
|
||||
// recovery information used to restore the KV cells to their original state in case of a failure
|
||||
// TODO: do not store as a state in the llama_kv_cache object, instead return upon batch preparation
|
||||
// to achieve that, first need to refactor the llama_kv_cache interface [TAG: KV_API]
|
||||
struct {
|
||||
void clear() {
|
||||
cells.clear();
|
||||
states.clear();
|
||||
}
|
||||
|
||||
std::unordered_map<uint32_t, kv_cell> cells;
|
||||
struct state {
|
||||
uint32_t i;
|
||||
|
||||
llama_kv_cells_unified cells;
|
||||
};
|
||||
|
||||
// stack with the partial states before each ubatch
|
||||
std::vector<state> states;
|
||||
} recovery;
|
||||
|
||||
// defrag
|
||||
@@ -256,9 +246,6 @@ private:
|
||||
// return true if cells have been moved
|
||||
bool defrag_prepare(int32_t n_max_nodes);
|
||||
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
size_t size_k_bytes() const;
|
||||
@@ -325,7 +312,7 @@ public:
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
|
||||
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
||||
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
@@ -431,7 +418,7 @@ public:
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
|
||||
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
||||
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
|
||||
@@ -0,0 +1,379 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-cparams.h"
|
||||
|
||||
#include <bitset>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
// meta information about KV cells that can be part of multiple sequences at the same time
|
||||
// TODO: add unit tests
|
||||
class llama_kv_cells_unified {
|
||||
public:
|
||||
void reset() {
|
||||
for (uint32_t i = 0; i < pos.size(); ++i) {
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
seq[i].reset();
|
||||
}
|
||||
|
||||
has_shift = false;
|
||||
|
||||
used.clear();
|
||||
|
||||
for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
seq_pos[s].clear();
|
||||
}
|
||||
}
|
||||
|
||||
void reset_shift() {
|
||||
has_shift = false;
|
||||
|
||||
for (uint32_t i = 0; i < shift.size(); ++i) {
|
||||
shift[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t size() const {
|
||||
return pos.size();
|
||||
}
|
||||
|
||||
void resize(uint32_t n) {
|
||||
pos.resize(n);
|
||||
shift.resize(n);
|
||||
seq.resize(n);
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
bool is_empty(uint32_t i) const {
|
||||
assert(i < pos.size());
|
||||
assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
|
||||
|
||||
return pos[i] == -1;
|
||||
}
|
||||
|
||||
uint32_t get_used() const {
|
||||
return used.size();
|
||||
}
|
||||
|
||||
// the index of the first cell that is used
|
||||
// return 0 if no cells are used
|
||||
uint32_t used_min() const {
|
||||
return used.empty() ? 0 : *used.begin();
|
||||
}
|
||||
|
||||
// the index of the last cell that is used + 1
|
||||
// return 0 if no cells are used
|
||||
uint32_t used_max_p1() const {
|
||||
#if 0
|
||||
if (!seq_pos[0].empty()) printf("kv_cells: min[0] = %5d, max[0] = %5d\n", *seq_pos[0].begin(), *seq_pos[0].rbegin());
|
||||
if (!seq_pos[1].empty()) printf("kv_cells: min[1] = %5d, max[1] = %5d\n", *seq_pos[1].begin(), *seq_pos[1].rbegin());
|
||||
if (!seq_pos[2].empty()) printf("kv_cells: min[2] = %5d, max[2] = %5d\n", *seq_pos[2].begin(), *seq_pos[2].rbegin());
|
||||
#endif
|
||||
|
||||
return used.empty() ? 0 : *used.rbegin() + 1;
|
||||
}
|
||||
|
||||
bool get_has_shift() const {
|
||||
return has_shift;
|
||||
}
|
||||
|
||||
// move cell isrc to idst (used during defrag)
|
||||
void mv(uint32_t isrc, uint32_t idst) {
|
||||
assert(isrc < pos.size());
|
||||
assert(idst < pos.size());
|
||||
|
||||
pos [idst] = pos [isrc];
|
||||
shift[idst] = shift[isrc];
|
||||
seq [idst] = seq [isrc];
|
||||
|
||||
pos [isrc] = -1;
|
||||
shift[isrc] = 0;
|
||||
seq [isrc].reset();
|
||||
|
||||
used.erase (isrc);
|
||||
used.insert(idst);
|
||||
}
|
||||
|
||||
// copy the state of cells [i, i + n) (used for save/restore the state of the cells)
|
||||
llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
|
||||
assert(i + n <= pos.size());
|
||||
|
||||
llama_kv_cells_unified res;
|
||||
|
||||
res.resize(n);
|
||||
|
||||
for (uint32_t j = 0; j < n; ++j) {
|
||||
res.pos[j] = pos[i + j];
|
||||
res.seq[j] = seq[i + j];
|
||||
|
||||
assert(shift[i + j] == 0);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
|
||||
void set(uint32_t i, const llama_kv_cells_unified & other) {
|
||||
assert(i + other.pos.size() <= pos.size());
|
||||
|
||||
for (uint32_t j = 0; j < other.pos.size(); ++j) {
|
||||
if (pos[i + j] == -1 && other.pos[j] != -1) {
|
||||
used.insert(i + j);
|
||||
}
|
||||
|
||||
if (pos[i + j] != -1 && other.pos[j] == -1) {
|
||||
used.erase(i + j);
|
||||
}
|
||||
|
||||
if (pos[i + j] != -1) {
|
||||
seq_pos_rm(i + j);
|
||||
}
|
||||
|
||||
pos[i + j] = other.pos[j];
|
||||
seq[i + j] = other.seq[j];
|
||||
|
||||
if (pos[i + j] != -1) {
|
||||
seq_pos_add(i + j);
|
||||
}
|
||||
|
||||
assert(shift[i + j] == 0);
|
||||
}
|
||||
}
|
||||
|
||||
// note: call only if the cell has seq_id
|
||||
// return true if the cell becomes empty
|
||||
bool seq_rm(uint32_t i, llama_seq_id seq_id) {
|
||||
assert(i < pos.size());
|
||||
assert(seq[i].test(seq_id));
|
||||
assert(pos[i] != -1);
|
||||
assert(seq_id >= 0);
|
||||
|
||||
seq[i].reset(seq_id);
|
||||
seq_pos[seq_id].erase(pos[i]);
|
||||
|
||||
if (seq[i].none()) {
|
||||
pos[i] = -1;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
|
||||
bool seq_keep(uint32_t i, llama_seq_id seq_id) {
|
||||
assert(i < pos.size());
|
||||
|
||||
if (seq[i].test(seq_id)) {
|
||||
seq_pos_rm(i);
|
||||
seq[i].reset();
|
||||
|
||||
seq[i].set(seq_id);
|
||||
seq_pos[seq_id].insert(pos[i]);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
if (seq[i].any()) {
|
||||
seq_pos_rm(i);
|
||||
seq[i].reset();
|
||||
|
||||
pos[i] = -1;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
assert(pos[i] == -1);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool seq_has(uint32_t i, llama_seq_id seq_id) const {
|
||||
assert(i < pos.size());
|
||||
assert(seq_id >= 0);
|
||||
|
||||
return seq[i].test(seq_id);
|
||||
}
|
||||
|
||||
// note: call only if the cell is not empty and the seq_id is not in the cell
|
||||
void seq_add(uint32_t i, llama_seq_id seq_id) {
|
||||
assert(i < pos.size());
|
||||
assert(pos[i] != -1);
|
||||
assert(!seq[i].test(seq_id));
|
||||
|
||||
seq[i].set(seq_id);
|
||||
seq_pos[seq_id].insert(pos[i]);
|
||||
}
|
||||
|
||||
// the minimum position of sequence seq_id currently present in any of the cells
|
||||
// return -1 if the sequence is not present
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const {
|
||||
assert(seq_id >= 0);
|
||||
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
|
||||
if (seq_pos[seq_id].empty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return *seq_pos[seq_id].begin();
|
||||
}
|
||||
|
||||
// the maximum position of sequence seq_id currently present in any of the cells
|
||||
// return -1 if the sequence is not present
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const {
|
||||
assert(seq_id >= 0);
|
||||
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
|
||||
if (seq_pos[seq_id].empty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return *seq_pos[seq_id].rbegin();
|
||||
}
|
||||
|
||||
// note: call only if the cell is not empty
|
||||
llama_pos pos_get(uint32_t i) const {
|
||||
assert(i < pos.size());
|
||||
assert(pos[i] != -1);
|
||||
|
||||
return pos[i];
|
||||
}
|
||||
|
||||
// note: call only if the cell is not empty
|
||||
llama_pos get_shift(uint32_t i) const {
|
||||
assert(i < pos.size());
|
||||
assert(pos[i] != -1);
|
||||
|
||||
return shift[i];
|
||||
}
|
||||
|
||||
// check if a cell is not empty and its position is within [p0, p1)
|
||||
bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
|
||||
assert(i < pos.size());
|
||||
|
||||
return pos[i] >= p0 && pos[i] < p1;
|
||||
}
|
||||
|
||||
// set the position of an empty cell
|
||||
// does not modify "has_shift"
|
||||
// note: call only if the cell is empty
|
||||
void pos_set(uint32_t i, llama_pos p) {
|
||||
assert(i < pos.size());
|
||||
assert(pos[i] == -1);
|
||||
|
||||
pos[i] = p;
|
||||
|
||||
used.insert(i);
|
||||
}
|
||||
|
||||
// pos[i] = pos[i] + d
|
||||
// sets "has_shift" to true
|
||||
// note: call only if the cell is not empty
|
||||
bool pos_add(uint32_t i, llama_pos d) {
|
||||
assert(i < pos.size());
|
||||
assert(pos[i] != -1);
|
||||
|
||||
seq_pos_rm(i);
|
||||
|
||||
pos[i] += d;
|
||||
shift[i] += d;
|
||||
|
||||
seq_pos_add(i);
|
||||
|
||||
has_shift = true;
|
||||
|
||||
if (pos[i] < 0) {
|
||||
seq_pos_rm(i);
|
||||
|
||||
seq[i].reset();
|
||||
pos[i] = -1;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// pos[i] = pos[i] / d
|
||||
// sets "has_shift" to true
|
||||
// note: call only if the cell is not empty
|
||||
void pos_div(uint32_t i, int d) {
|
||||
assert(i < pos.size());
|
||||
assert(pos[i] != -1);
|
||||
|
||||
const llama_pos p_old = pos[i];
|
||||
|
||||
seq_pos_rm(i);
|
||||
|
||||
pos[i] /= d;
|
||||
shift[i] += p_old - pos[i];
|
||||
|
||||
seq_pos_add(i);
|
||||
|
||||
has_shift = true;
|
||||
}
|
||||
|
||||
private:
|
||||
bool has_shift = false;
|
||||
|
||||
// set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
|
||||
std::set<uint32_t> used;
|
||||
|
||||
std::vector<llama_pos> pos;
|
||||
|
||||
// this array accumulates any applied shifts to the pos array since the last reset_shift() call
|
||||
// this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
|
||||
//
|
||||
// cells.pos_add(x, shift_x);
|
||||
// cells.pos_div(y, shift_y);
|
||||
// ...
|
||||
//
|
||||
// if (cells.has_shift()) {
|
||||
// for (int i = 0; i < n; ++i) {
|
||||
// auto shift_i = cells.get_shift(i);
|
||||
// ...
|
||||
// }
|
||||
// cells.reset_shift();
|
||||
// }
|
||||
//
|
||||
std::vector<llama_pos> shift;
|
||||
|
||||
using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
|
||||
|
||||
// the bitset seq[i] tells us which sequences are currently occupying the i-th cell
|
||||
std::vector<bits_t> seq;
|
||||
|
||||
// the set seq_pos[s] tells us which positions are currently present for sequence s
|
||||
// this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
|
||||
std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
|
||||
// helper functions for updating `seq_pos`, once cell at a time:
|
||||
|
||||
// remove cell i
|
||||
void seq_pos_rm(uint32_t i) {
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
seq_pos[s].erase(pos[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add cell i
|
||||
void seq_pos_add(uint32_t i) {
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
seq_pos[s].insert(pos[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
+1
-1
@@ -22,7 +22,7 @@ public:
|
||||
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
|
||||
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
|
||||
virtual void seq_keep(llama_seq_id seq_id) = 0;
|
||||
virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
|
||||
virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;
|
||||
virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
|
||||
|
||||
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
|
||||
|
||||
+17
-10
@@ -463,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
GGML_ASSERT(hparams.n_expert_used == 0);
|
||||
}
|
||||
|
||||
// zero-out the array hparams
|
||||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
|
||||
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
||||
|
||||
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
|
||||
|
||||
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
|
||||
|
||||
@@ -574,7 +577,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
|
||||
hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
||||
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
|
||||
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
||||
|
||||
switch (hparams.n_expert) {
|
||||
case 16: type = LLM_TYPE_17B_16E; break;
|
||||
@@ -863,7 +866,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
hparams.n_swa = 0;
|
||||
hparams.n_swa_pattern = 1;
|
||||
hparams.set_swa_pattern(1);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PHIMOE:
|
||||
@@ -935,7 +938,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096; // default value of gemma 2
|
||||
hparams.n_swa_pattern = 2;
|
||||
hparams.set_swa_pattern(2);
|
||||
hparams.attn_soft_cap = true;
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
@@ -953,7 +956,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa_pattern = 6;
|
||||
hparams.set_swa_pattern(6);
|
||||
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
@@ -1038,7 +1041,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_COHERE2:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa_pattern = 4;
|
||||
hparams.set_swa_pattern(4);
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
@@ -2486,7 +2489,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
@@ -4320,7 +4327,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
||||
LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
|
||||
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
||||
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
||||
@@ -13216,7 +13223,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
GGML_ASSERT(hparams.n_swa_pattern != 1);
|
||||
GGML_ASSERT(hparams.is_swa_any());
|
||||
|
||||
res = new llama_kv_cache_unified_iswa(
|
||||
*this,
|
||||
@@ -13230,7 +13237,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
cparams.n_batch,
|
||||
padding);
|
||||
} else {
|
||||
GGML_ASSERT(hparams.n_swa_pattern == 1);
|
||||
GGML_ASSERT(!hparams.is_swa_any());
|
||||
|
||||
res = new llama_kv_cache_unified(
|
||||
*this,
|
||||
|
||||
@@ -798,7 +798,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
|
||||
}
|
||||
|
||||
// if we have enough values the operation was a success
|
||||
if (filtered_tokens.size() >= ctx->min_keep) {
|
||||
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
|
||||
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
||||
cur_p->size = filtered_tokens.size();
|
||||
min_p_applied = true;
|
||||
@@ -909,7 +909,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
|
||||
cum_sum += cur_p->data[idx].p;
|
||||
|
||||
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
|
||||
if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
|
||||
if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) {
|
||||
last_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
|
||||
+4
-4
@@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
|
||||
}
|
||||
|
||||
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
|
||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
|
||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
|
||||
// at the beginning tokenization score is zero
|
||||
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
|
||||
|
||||
@@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
|
||||
const double challenger_score = current_best.score_sum + token_score;
|
||||
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
||||
if (challenger_score > current_champ.score_sum) {
|
||||
struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
|
||||
struct best_tokenization challenger = { token_id, input_offset, challenger_score };
|
||||
current_champ = challenger;
|
||||
}
|
||||
}
|
||||
@@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
|
||||
prefix_offset = input_offset + n_utf8_code_units;
|
||||
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
||||
if (challenger_score > current_champ.score_sum) {
|
||||
struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
|
||||
struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
|
||||
current_champ = challenger;
|
||||
}
|
||||
}
|
||||
@@ -1007,7 +1007,7 @@ private:
|
||||
struct best_tokenization {
|
||||
llama_token token_id;
|
||||
size_t input_offset;
|
||||
float score_sum;
|
||||
double score_sum;
|
||||
};
|
||||
|
||||
struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
|
||||
|
||||
@@ -92,6 +92,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
@@ -142,8 +143,10 @@ if (NOT WIN32)
|
||||
# llama_build_and_test(test-double-float.cpp) # SLOW
|
||||
endif()
|
||||
|
||||
llama_build_and_test(test-log.cpp)
|
||||
llama_build_and_test(test-chat-parser.cpp)
|
||||
llama_build_and_test(test-chat-template.cpp)
|
||||
llama_build_and_test(test-json-partial.cpp)
|
||||
llama_build_and_test(test-log.cpp)
|
||||
llama_build_and_test(test-regex-partial.cpp)
|
||||
|
||||
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
|
||||
|
||||
@@ -0,0 +1,355 @@
|
||||
// Tests chat handling, including grammar generation and parsing for tool calling, for various templates.
|
||||
//
|
||||
// Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates,
|
||||
// e.g. given Minja (http://github.com/google/minja) checked out in parent dir:
|
||||
//
|
||||
// cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
|
||||
//
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <json.hpp>
|
||||
#include <string>
|
||||
|
||||
#include "chat-parser.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "regex-partial.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
template <class T>
|
||||
static void assert_equals(const T & expected, const T & actual) {
|
||||
if (expected != actual) {
|
||||
std::cerr << "Expected: " << expected << std::endl;
|
||||
std::cerr << "Actual: " << actual << std::endl;
|
||||
std::cerr << std::flush;
|
||||
throw std::runtime_error("Test failed");
|
||||
}
|
||||
}
|
||||
static void assert_equals(const char * expected, const std::string & actual) {
|
||||
return assert_equals<std::string>(expected, actual);
|
||||
}
|
||||
|
||||
static void assert_throws(const std::function<void()> & fn, const std::string & expected_exception_pattern = "") {
|
||||
try {
|
||||
fn();
|
||||
} catch (const std::exception & e) {
|
||||
if (expected_exception_pattern.empty()) {
|
||||
return;
|
||||
}
|
||||
std::regex expected_exception_regex(expected_exception_pattern);
|
||||
std::string actual_message = e.what();
|
||||
if (std::regex_search(actual_message, expected_exception_regex)) {
|
||||
return;
|
||||
}
|
||||
throw std::runtime_error("Exception doesn't match expected pattern: " + actual_message + " (pattern: " + expected_exception_pattern + ")");
|
||||
throw std::runtime_error("Exception of unexpected type: " + std::string(e.what()));
|
||||
}
|
||||
throw std::runtime_error("Exception was expected but not thrown");
|
||||
}
|
||||
|
||||
static void test_reasoning() {
|
||||
{
|
||||
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
});
|
||||
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
|
||||
assert_equals("<tnk>Cogito</tnk>Ergo sum", builder.consume_rest());
|
||||
}
|
||||
{
|
||||
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
});
|
||||
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
|
||||
assert_equals(std::string("Cogito"), builder.result().reasoning_content);
|
||||
assert_equals("Ergo sum", builder.consume_rest());
|
||||
}
|
||||
{
|
||||
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
});
|
||||
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
|
||||
assert_equals("Cogito</tnk>Ergo sum", builder.consume_rest());
|
||||
}
|
||||
{
|
||||
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
});
|
||||
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
|
||||
assert_equals(std::string("Cogito"), builder.result().reasoning_content);
|
||||
assert_equals("Ergo sum", builder.consume_rest());
|
||||
}
|
||||
{
|
||||
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ true,
|
||||
/* .thinking_forced_open = */ true,
|
||||
});
|
||||
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
|
||||
assert_equals("<think>Cogito</think>", builder.result().content);
|
||||
assert_equals("Ergo sum", builder.consume_rest());
|
||||
}
|
||||
}
|
||||
|
||||
static void test_regex() {
|
||||
auto test_throws = [](const std::string & input, const std::string & regex, const std::string & expected_exception_pattern = "") {
|
||||
common_chat_msg_parser builder(input, /* is_partial= */ false, {});
|
||||
assert_throws([&]() { builder.consume_regex(common_regex(regex)); }, expected_exception_pattern);
|
||||
};
|
||||
|
||||
test_throws("Hello, world!", "abc", "^abc$");
|
||||
test_throws("Hello, world!", "e", "^e$");
|
||||
|
||||
{
|
||||
common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {});
|
||||
builder.consume_regex(common_regex("Hello"));
|
||||
assert_equals(", world!", builder.consume_rest());
|
||||
}
|
||||
|
||||
{
|
||||
// When in non partial mode, we can say whether the regex was consumed or not.
|
||||
common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {});
|
||||
assert_equals(false, builder.try_consume_regex(common_regex("Hello, world!")).has_value());
|
||||
}
|
||||
{
|
||||
common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {});
|
||||
auto res = builder.try_consume_regex(common_regex("H(el)l(?:o, world!)?"));
|
||||
assert_equals(true, res.has_value());
|
||||
// Verify captures
|
||||
assert_equals<size_t>(2, res->groups.size());
|
||||
assert_equals("Hell", builder.str(res->groups[0]));
|
||||
assert_equals("el", builder.str(res->groups[1]));
|
||||
// Verify position is after the match
|
||||
assert_equals<size_t>(4, builder.pos());
|
||||
assert_equals("o,", builder.consume_rest());
|
||||
}
|
||||
{
|
||||
// But in partial mode, we have a partial final match / can't decide, so we throw a partial exception.
|
||||
common_chat_msg_parser builder("Hello,", /* is_partial= */ true, {});
|
||||
assert_throws([&]() {
|
||||
builder.try_consume_regex(common_regex("Hello, world!"));
|
||||
}, "^Hello, world!$");
|
||||
}
|
||||
|
||||
// Now regardless of the mode, we can tell these aren't a match.
|
||||
for (const auto is_partial : {false, true}) {
|
||||
common_chat_msg_parser builder("Hello,", is_partial, {});
|
||||
assert_equals(false, builder.try_consume_regex(common_regex("a(b|c)(d|e)f")).has_value());
|
||||
}
|
||||
for (const auto is_partial : {false, true}) {
|
||||
common_chat_msg_parser builder("Hello,", is_partial, {});
|
||||
assert_equals(false, builder.try_consume_literal("Oh"));
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<std::string> barely_healable_jsons = {
|
||||
"{",
|
||||
"{\"",
|
||||
"{\"\\",
|
||||
"{\"n",
|
||||
"{\"name\"",
|
||||
"{\"name\":",
|
||||
"{\"name\":\"",
|
||||
"{\"name\":\"\\",
|
||||
"{\"name\":\"python",
|
||||
"{\"name\":\"python\\",
|
||||
"{\",",
|
||||
"{\":",
|
||||
"{\"[",
|
||||
"{\"]",
|
||||
"{\"{",
|
||||
"{\"}",
|
||||
"{\"1",
|
||||
"{\"name\":\",",
|
||||
"{\"name\":\":",
|
||||
"{\"name\":\"[",
|
||||
"{\"name\":\"]",
|
||||
"{\"name\":\"{",
|
||||
"{\"name\":\"}",
|
||||
"{\"name\":\"1",
|
||||
};
|
||||
|
||||
static void test(const std::string & input, bool is_partial, const std::vector<std::vector<std::string>> & args_paths, const std::vector<std::vector<std::string>> & content_paths, const std::string & expected) {
|
||||
common_chat_msg_parser builder(input, is_partial, {});
|
||||
auto js = builder.try_consume_json_with_dumped_args(args_paths, content_paths);
|
||||
assert_equals(true, js.has_value());
|
||||
assert_equals(is_partial, js->is_partial);
|
||||
assert_equals(expected, args_paths.size() == 1 && args_paths[0].empty() ? js->value.get<std::string>() : js->value.dump());
|
||||
}
|
||||
static void test_with_args(const std::string & input, const std::string & expected, bool parse_as_partial = true, bool is_partial = true) {
|
||||
common_chat_msg_parser builder(input, parse_as_partial, {});
|
||||
auto js = builder.try_consume_json_with_dumped_args({{"args"}}, {});
|
||||
assert_equals(true, js.has_value());
|
||||
assert_equals(is_partial, js->is_partial);
|
||||
assert_equals(expected, js->value.dump());
|
||||
}
|
||||
|
||||
static void test_json_with_dumped_args_no_args() {
|
||||
// Normal JSON, nothing to heal, nothing to dump
|
||||
test("{\"name\": \"python\"}", false, {}, {}, "{\"name\":\"python\"}");
|
||||
// Full json is args
|
||||
test("{\"name\": \"python\"}", false, {{}}, {}, "{\"name\":\"python\"}");
|
||||
|
||||
// If the arguments are further down, don't heal partial content.
|
||||
for (const auto & src : barely_healable_jsons) {
|
||||
test(src, true, {{"arguments"}}, {}, "{}");
|
||||
}
|
||||
// But heal content that isn't partial.
|
||||
test("{\"name\": \"python\"", true, {{"arguments"}}, {}, "{\"name\":\"python\"}");
|
||||
}
|
||||
|
||||
static void test_json_with_dumped_args() {
|
||||
|
||||
// Partial content.
|
||||
test("{\"content\": \"t", true, {}, {{"content"}}, "{\"content\":\"t\"}");
|
||||
test("{\"content\": \"", true, {}, {{"content"}}, "{\"content\":\"\"}");
|
||||
test("{\"content\": ", true, {}, {{"content"}}, "{}");
|
||||
|
||||
// If the entire JSON is the arguments, healing it them dumping it produces the same output as the input (just reformatted).
|
||||
test("{\"name\": \"python", true, {{}}, {}, "{\"name\":\"python");
|
||||
for (const auto & src : barely_healable_jsons) {
|
||||
test(src, true, {{}}, {}, src);
|
||||
}
|
||||
|
||||
// Full JSON w/ args
|
||||
for (auto parse_as_partial : {true, false}) {
|
||||
test_with_args(
|
||||
R"({"name": "python", "args": {"arg1": 1}})",
|
||||
R"({"name":"python","args":"{\"arg1\":1}"})",
|
||||
parse_as_partial,
|
||||
/* is_partial= */ false
|
||||
);
|
||||
}
|
||||
|
||||
// Partial JSON w/ partial args
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {")",
|
||||
R"({"foo":"bar","args":"{\""})"
|
||||
);
|
||||
// Partial args broken in object key
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"ar)",
|
||||
R"({"foo":"bar","args":"{\"ar"})"
|
||||
);
|
||||
// Partial args broken after object key
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1")",
|
||||
R"({"foo":"bar","args":"{\"arg1\""})"
|
||||
);
|
||||
// Partial args broken before object value
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1":)",
|
||||
R"({"foo":"bar","args":"{\"arg1\":"})"
|
||||
);
|
||||
// Partial args broken before object value (space)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1": )",
|
||||
R"({"foo":"bar","args":"{\"arg1\":"})"
|
||||
);
|
||||
// Partial args broken in object value that may not be complete (int)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1": 1)",
|
||||
R"({"foo":"bar","args":"{\"arg1\":"})"
|
||||
);
|
||||
// Partial args broken in object value that is complete (int)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1": 1 )",
|
||||
R"({"foo":"bar","args":"{\"arg1\":1"})"
|
||||
);
|
||||
// Partial args broken in object value that is incomplete (string)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1": ")",
|
||||
R"({"foo":"bar","args":"{\"arg1\":\""})"
|
||||
);
|
||||
// Partial args broken in object value that is complete (string)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1": "1")",
|
||||
R"({"foo":"bar","args":"{\"arg1\":\"1\""})"
|
||||
);
|
||||
// Partial args broken on array opening
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": [)",
|
||||
R"({"foo":"bar","args":"["})"
|
||||
);
|
||||
// Partial args broken on array value that is incomplete (int)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": [1)",
|
||||
R"({"foo":"bar","args":"["})"
|
||||
);
|
||||
// Partial args broken on array value that is complete (int)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": [1 )",
|
||||
R"({"foo":"bar","args":"[1"})"
|
||||
);
|
||||
// Partial args broken on array value that is complete (string)
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": ["1")",
|
||||
R"({"foo":"bar","args":"[\"1\""})"
|
||||
);
|
||||
// Partial args broken after array value
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": [1,)",
|
||||
R"({"foo":"bar","args":"[1,"})"
|
||||
);
|
||||
// Partial args broken on nested array
|
||||
test_with_args(
|
||||
R"({"foo": "bar", "args": {"arg1": [)",
|
||||
R"({"foo":"bar","args":"{\"arg1\":["})"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_positions() {
|
||||
{
|
||||
common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {});
|
||||
assert_equals<size_t>(0, builder.pos());
|
||||
assert_throws([&]() { builder.move_to(100); });
|
||||
assert_equals<size_t>(0, builder.pos());
|
||||
assert_throws([&]() { builder.move_back(1); });
|
||||
assert_equals<size_t>(0, builder.pos());
|
||||
|
||||
builder.move_to(8);
|
||||
assert_equals<size_t>(8, builder.pos());
|
||||
builder.move_back(1);
|
||||
assert_equals<size_t>(7, builder.pos());
|
||||
assert_equals("world!", builder.consume_rest());
|
||||
|
||||
builder.move_to(0);
|
||||
assert_equals<size_t>(0, builder.pos());
|
||||
|
||||
assert_throws([&]() { builder.finish(); });
|
||||
assert_equals<size_t>(0, builder.pos());
|
||||
|
||||
builder.move_to(builder.input().size());
|
||||
builder.finish();
|
||||
}
|
||||
{
|
||||
common_chat_msg_parser builder("Hello, world!", /* is_partial= */ true, {});
|
||||
|
||||
builder.move_to(builder.input().size());
|
||||
assert_equals<size_t>(builder.input().size(), builder.pos());
|
||||
builder.finish();
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
test_positions();
|
||||
test_json_with_dumped_args_no_args();
|
||||
test_json_with_dumped_args();
|
||||
test_reasoning();
|
||||
test_regex();
|
||||
std::cout << "All tests passed!\n";
|
||||
return 0;
|
||||
}
|
||||
+755
-276
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,237 @@
|
||||
#include "common.h"
|
||||
#include "json-partial.h"
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
template <class T> static void assert_equals(const T & expected, const T & actual) {
|
||||
if (expected != actual) {
|
||||
std::cerr << "Expected: " << expected << std::endl;
|
||||
std::cerr << "Actual: " << actual << std::endl;
|
||||
std::cerr << std::flush;
|
||||
throw std::runtime_error("Test failed");
|
||||
}
|
||||
}
|
||||
|
||||
static void test_json_healing() {
|
||||
auto parse = [](const std::string & str) {
|
||||
std::cerr << "# Parsing: " << str << '\n';
|
||||
std::string::const_iterator it = str.begin();
|
||||
const auto end = str.end();
|
||||
common_json out;
|
||||
std::string healing_marker = "$llama.cpp.json$";
|
||||
if (common_json_parse(it, end, healing_marker, out)) {
|
||||
auto dump = out.json.dump();
|
||||
std::cerr << "Parsed: " << dump << '\n';
|
||||
std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
|
||||
std::string result;
|
||||
if (!out.healing_marker.json_dump_marker.empty()) {
|
||||
auto i = dump.find(out.healing_marker.json_dump_marker);
|
||||
if (i == std::string::npos) {
|
||||
throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
|
||||
}
|
||||
result = dump.substr(0, i);
|
||||
} else {
|
||||
result = dump;
|
||||
}
|
||||
std::cerr << "Result: " << result << '\n';
|
||||
if (string_starts_with(str, result)) {
|
||||
std::cerr << "Failure!\n";
|
||||
}
|
||||
// return dump;
|
||||
} else {
|
||||
throw std::runtime_error("Failed to parse: " + str);
|
||||
}
|
||||
|
||||
};
|
||||
auto parse_all = [&](const std::string & str) {
|
||||
for (size_t i = 1; i < str.size(); i++) {
|
||||
parse(str.substr(0, i));
|
||||
}
|
||||
};
|
||||
parse_all("{\"a\": \"b\"}");
|
||||
parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
|
||||
|
||||
parse_all("[{\"a\": \"b\"}]");
|
||||
|
||||
auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
|
||||
for (const auto & input : inputs) {
|
||||
common_json out;
|
||||
assert_equals(true, common_json_parse(input, "$foo", out));
|
||||
assert_equals<std::string>(expected, out.json.dump());
|
||||
assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
|
||||
}
|
||||
};
|
||||
// No healing needed:
|
||||
test(
|
||||
{
|
||||
R"([{"a":"b"}, "y"])",
|
||||
},
|
||||
R"([{"a":"b"},"y"])",
|
||||
""
|
||||
);
|
||||
// Partial literals can't be healed:
|
||||
test(
|
||||
{
|
||||
R"([1)",
|
||||
R"([tru)",
|
||||
R"([n)",
|
||||
R"([nul)",
|
||||
R"([23.2)",
|
||||
},
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a": 1)",
|
||||
R"({"a": tru)",
|
||||
R"({"a": n)",
|
||||
R"({"a": nul)",
|
||||
R"({"a": 23.2)",
|
||||
},
|
||||
R"({"a":"$foo"})",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({)",
|
||||
},
|
||||
R"({"$foo":1})",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([)",
|
||||
},
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
// Healing right after a full literal
|
||||
test(
|
||||
{
|
||||
R"(1 )",
|
||||
},
|
||||
R"(1)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"(true)",
|
||||
R"(true )",
|
||||
},
|
||||
R"(true)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"(null)",
|
||||
R"(null )",
|
||||
},
|
||||
R"(null)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([1 )",
|
||||
},
|
||||
R"([1,"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{})",
|
||||
R"([{} )",
|
||||
},
|
||||
R"([{},"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true)",
|
||||
},
|
||||
// TODO: detect the true/false/null literal was complete
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true )",
|
||||
},
|
||||
R"([true,"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true,)",
|
||||
},
|
||||
R"([true,"$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
// Test nesting
|
||||
test(
|
||||
{
|
||||
R"([{"a": [{"b": [{)",
|
||||
},
|
||||
R"([{"a":[{"b":[{"$foo":1}]}]}])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{"a": [{"b": [)",
|
||||
},
|
||||
R"([{"a":[{"b":["$foo"]}]}])",
|
||||
R"("$foo)"
|
||||
);
|
||||
|
||||
test(
|
||||
{
|
||||
R"([{"a": "b"})",
|
||||
R"([{"a": "b"} )",
|
||||
},
|
||||
R"([{"a":"b"},"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{"a": "b"},)",
|
||||
R"([{"a": "b"}, )",
|
||||
},
|
||||
R"([{"a":"b"},"$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code)",
|
||||
},
|
||||
R"({"code$foo":1})",
|
||||
R"($foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code\)",
|
||||
},
|
||||
R"({"code\\$foo":1})",
|
||||
R"(\$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code")",
|
||||
},
|
||||
R"({"code":"$foo"})",
|
||||
R"(:"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "key")",
|
||||
},
|
||||
R"({"key":"$foo"})",
|
||||
R"(:"$foo)"
|
||||
);
|
||||
}
|
||||
|
||||
int main() {
|
||||
test_json_healing();
|
||||
std::cerr << "All tests passed.\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -98,7 +98,7 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
|
||||
sampler_tester tester(probs, probs_expected);
|
||||
|
||||
DUMP(&tester.cur_p);
|
||||
tester.apply(llama_sampler_init_top_p(p, 1));
|
||||
tester.apply(llama_sampler_init_top_p(p, 0));
|
||||
tester.apply(llama_sampler_init_dist (0));
|
||||
DUMP(&tester.cur_p);
|
||||
|
||||
@@ -109,7 +109,7 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
|
||||
sampler_tester tester(probs, probs_expected);
|
||||
|
||||
DUMP(&tester.cur_p);
|
||||
tester.apply(llama_sampler_init_min_p(p, 1));
|
||||
tester.apply(llama_sampler_init_min_p(p, 0));
|
||||
tester.apply(llama_sampler_init_dist (0));
|
||||
DUMP(&tester.cur_p);
|
||||
|
||||
@@ -130,7 +130,7 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
|
||||
sampler_tester tester(probs, probs_expected);
|
||||
|
||||
DUMP(&tester.cur_p);
|
||||
tester.apply(llama_sampler_init_typical(p, 1));
|
||||
tester.apply(llama_sampler_init_typical(p, 0));
|
||||
DUMP(&tester.cur_p);
|
||||
|
||||
tester.check();
|
||||
@@ -332,6 +332,7 @@ int main(void) {
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.05f);
|
||||
|
||||
printf("XTC should:\n");
|
||||
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f);
|
||||
@@ -341,8 +342,8 @@ int main(void) {
|
||||
printf("XTC should not:\n");
|
||||
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f);
|
||||
|
||||
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
|
||||
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
|
||||
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
|
||||
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
|
||||
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
|
||||
|
||||
@@ -107,6 +107,7 @@
|
||||
// ultravox
|
||||
#define TN_CONV1D "a.conv1d.%d.%s"
|
||||
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
|
||||
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
|
||||
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
|
||||
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
|
||||
|
||||
@@ -128,6 +129,8 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_ULTRAVOX,
|
||||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -145,6 +148,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
|
||||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
+470
-341
File diff suppressed because it is too large
Load Diff
+14
-1
@@ -4,6 +4,8 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// !!! Internal header, to be used by mtmd only !!!
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
struct clip_image_size {
|
||||
@@ -15,12 +17,22 @@ struct clip_image_f32;
|
||||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
enum clip_modality {
|
||||
CLIP_MODALITY_VISION,
|
||||
CLIP_MODALITY_AUDIO,
|
||||
};
|
||||
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
enum ggml_log_level verbosity;
|
||||
};
|
||||
|
||||
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
struct clip_init_result {
|
||||
struct clip_ctx * ctx_v; // vision context
|
||||
struct clip_ctx * ctx_a; // audio context
|
||||
};
|
||||
|
||||
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
@@ -99,3 +111,4 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
|
||||
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
|
||||
|
||||
@@ -284,7 +284,9 @@ int main(int argc, char ** argv) {
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
for (size_t i = 0; i < params.image.size(); i++) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
}
|
||||
}
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
|
||||
+34
-34
@@ -12,17 +12,7 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens_text;
|
||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
||||
n_tokens += n_tokens_text;
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
|
||||
}
|
||||
return n_tokens;
|
||||
}
|
||||
@@ -31,17 +21,7 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
||||
llama_pos n_pos = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens_text;
|
||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
||||
n_pos += n_tokens_text;
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
n_pos += mtmd_input_chunk_get_n_pos(chunk);
|
||||
}
|
||||
return n_pos;
|
||||
}
|
||||
@@ -86,7 +66,8 @@ struct decode_embd_batch {
|
||||
}
|
||||
}
|
||||
|
||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
@@ -105,6 +86,23 @@ struct decode_embd_batch {
|
||||
}
|
||||
}
|
||||
|
||||
// M-RoPE for audio
|
||||
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
@@ -166,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
|
||||
return -1;
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
GGML_ABORT("invalid chunk type for M-RoPE");
|
||||
}
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user